aopstudio committed
Commit 7d4a201 · 1 Parent(s): 867015d

Add docx support for manual parser (#1227)


### What problem does this PR solve?

Add docx support for manual parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
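
A minimal way to exercise the new path locally, as a sketch only — the file name and progress callback below are placeholders, not part of this PR. The manual parser's `chunk()` entry point should now accept a `.docx` file as well as a PDF:

```python
# Minimal sketch: run the manual parser on a .docx file.
# "sample_manual.docx" and progress() are hypothetical stand-ins.
from rag.app import manual

def progress(prog=None, msg=""):
    # progress callback, mirroring the dummy() used in manual.py's __main__
    print(prog, msg)

chunks = manual.chunk("sample_manual.docx", lang="English", callback=progress)
print(len(chunks))
```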

Files changed (3)
  1. rag/app/manual.py +179 -74
  2. rag/app/qa.py +3 -9
  3. rag/nlp/__init__.py +6 -0
rag/app/manual.py CHANGED

@@ -18,10 +18,13 @@ import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from io import BytesIO
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string
-
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from docx import Document
+from PIL import Image
 
 class Pdf(PdfParser):
     def __init__(self):

@@ -64,6 +67,98 @@ class Pdf(PdfParser):
         return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
                 for i, b in enumerate(self.boxes)], tbls
 
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob))
+        return image
+    def concat_img(self, img1, img2):
+        if img1 and not img2:
+            return img1
+        if not img1 and img2:
+            return img2
+        if not img1 and not img2:
+            return None
+        width1, height1 = img1.size
+        width2, height2 = img2.size
+
+        new_width = max(width1, width2)
+        new_height = height1 + height2
+        new_image = Image.new('RGB', (new_width, new_height))
+
+        new_image.paste(img1, (0, 0))
+        new_image.paste(img2, (0, height1))
+
+        return new_image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []
+        ti_list = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = self.concat_img(last_image, current_image)
+            else: # is a question
+                if last_answer or last_image:
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                    last_answer, last_image = '', None
+
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+
+        tbls = []
+        for tb in self.doc.tables:
+            html = "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i + 1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return ti_list, tbls
 
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):

@@ -71,7 +166,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         Only pdf is supported.
     """
     pdf_parser = None
-
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    # is it English
+    eng = lang.lower() == "english" # pdf_parser.is_english
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(
             "parser_config", {}).get(

@@ -80,80 +181,84 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                                     from_page=from_page, to_page=to_page, callback=callback)
         if sections and len(sections[0]) < 3:
             sections = [(t, l, [[0] * 5]) for t, l in sections]
+        # set pivot using the most frequent type of title,
+        # then merge between 2 pivot
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+            most_level = max(0, max_lvl - 1)
+            levels = []
+            for txt, _, _ in sections:
+                for t, lvl in pdf_parser.outlines:
+                    tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                    tks_ = set([txt[i] + txt[i + 1]
+                                for i in range(min(len(t), len(txt) - 1))])
+                    if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                        levels.append(lvl)
+                        break
+                else:
+                    levels.append(max_lvl + 1)
 
-    else:
-        raise NotImplementedError("file type not supported yet(pdf supported)")
-    doc = {
-        "docnm_kwd": filename
-    }
-    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
-    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
-    # is it English
-    eng = lang.lower() == "english" # pdf_parser.is_english
+        else:
+            bull = bullets_category([txt for txt, _, _ in sections])
+            most_level, levels = title_frequency(
+                bull, [(txt, l) for txt, l, poss in sections])
+
+        assert len(sections) == len(levels)
+        sec_ids = []
+        sid = 0
+        for i, lvl in enumerate(levels):
+            if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+                sid += 1
+            sec_ids.append(sid)
+            # print(lvl, self.boxes[i]["text"], most_level, sid)
 
-    # set pivot using the most frequent type of title,
-    # then merge between 2 pivot
-    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
-        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
-        most_level = max(0, max_lvl - 1)
-        levels = []
-        for txt, _, _ in sections:
-            for t, lvl in pdf_parser.outlines:
-                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
-                tks_ = set([txt[i] + txt[i + 1]
-                            for i in range(min(len(t), len(txt) - 1))])
-                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
-                    levels.append(lvl)
-                    break
-            else:
-                levels.append(max_lvl + 1)
+        sections = [(txt, sec_ids[i], poss)
+                    for i, (txt, _, poss) in enumerate(sections)]
+        for (img, rows), poss in tbls:
+            if not rows: continue
+            sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
 
+        def tag(pn, left, right, top, bottom):
+            if pn + left + right + top + bottom == 0:
+                return ""
+            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+                .format(pn, left, right, top, bottom)
+
+        chunks = []
+        last_sid = -2
+        tk_cnt = 0
+        for txt, sec_id, poss in sorted(sections, key=lambda x: (
+                x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+            poss = "\t".join([tag(*pos) for pos in poss])
+            if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
+                if chunks:
+                    chunks[-1] += "\n" + txt + poss
+                    tk_cnt += num_tokens_from_string(txt)
+                    continue
+            chunks.append(txt + poss)
+            tk_cnt = num_tokens_from_string(txt)
+            if sec_id > -1:
+                last_sid = sec_id
+
+        res = tokenize_table(tbls, doc, eng)
+        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        return res
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        ti_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        res = tokenize_table(tbls, doc, eng)
+        for text, image in ti_list:
+            d = copy.deepcopy(doc)
+            d['image'] = image
+            tokenize(d, text, eng)
+            res.append(d)
+        return res
     else:
-        bull = bullets_category([txt for txt, _, _ in sections])
-        most_level, levels = title_frequency(
-            bull, [(txt, l) for txt, l, poss in sections])
-
-    assert len(sections) == len(levels)
-    sec_ids = []
-    sid = 0
-    for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
-            sid += 1
-        sec_ids.append(sid)
-        # print(lvl, self.boxes[i]["text"], most_level, sid)
-
-    sections = [(txt, sec_ids[i], poss)
-                for i, (txt, _, poss) in enumerate(sections)]
-    for (img, rows), poss in tbls:
-        if not rows: continue
-        sections.append((rows if isinstance(rows, str) else rows[0], -1,
-                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-
-    def tag(pn, left, right, top, bottom):
-        if pn + left + right + top + bottom == 0:
-            return ""
-        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-            .format(pn, left, right, top, bottom)
-
-    chunks = []
-    last_sid = -2
-    tk_cnt = 0
-    for txt, sec_id, poss in sorted(sections, key=lambda x: (
-            x[-1][0][0], x[-1][0][3], x[-1][0][1])):
-        poss = "\t".join([tag(*pos) for pos in poss])
-        if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
-            if chunks:
-                chunks[-1] += "\n" + txt + poss
-                tk_cnt += num_tokens_from_string(txt)
-                continue
-        chunks.append(txt + poss)
-        tk_cnt = num_tokens_from_string(txt)
-        if sec_id > -1:
-            last_sid = sec_id
-
-    res = tokenize_table(tbls, doc, eng)
-    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
-    return res
+        raise NotImplementedError("file type not supported yet(pdf and docx supported)")
+
+
 
 
 if __name__ == "__main__":

@@ -164,4 +269,4 @@ if __name__ == "__main__":
         pass
 
 
-    chunk(sys.argv[1], callback=dummy)
+    chunk(sys.argv[1], callback=dummy)
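
The heart of the new `Docx.__call__` is the heading stack: plain paragraphs accumulate into `last_answer`, and when a heading arrives, the enclosing headings on the stack are joined with the accumulated text into one entry of `ti_list`. A self-contained toy sketch of that grouping (no python-docx, images, or page breaks; the sample headings are made up):

```python
# Toy illustration of the question-stack grouping used by Docx.__call__ (simplified).
paragraphs = [
    (1, "1 Installation"),   # (heading_level, text); level 0 means body text
    (2, "1.1 Requirements"),
    (0, "Python 3.8+ and Docker are required."),
    (2, "1.2 Setup"),
    (0, "Run docker compose up."),
]

chunks, question_stack, level_stack, answer = [], [], [], ""
for level, text in paragraphs:
    if not level:                      # body text extends the current answer
        answer += "\n" + text
        continue
    if answer and question_stack:      # a new heading flushes the finished section
        chunks.append("\n".join(question_stack) + answer)
        answer = ""
    while question_stack and level <= level_stack[-1]:  # pop headings at the same or deeper level
        question_stack.pop()
        level_stack.pop()
    question_stack.append(text)
    level_stack.append(level)
if answer and question_stack:
    chunks.append("\n".join(question_stack) + answer)

# chunks[0] == "1 Installation\n1.1 Requirements\nPython 3.8+ and Docker are required."
# chunks[1] == "1 Installation\n1.2 Setup\nRun docker compose up."
print(chunks)
```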
rag/app/qa.py CHANGED

@@ -16,7 +16,7 @@ from io import BytesIO
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet
+from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser

@@ -165,7 +165,7 @@ class Docx(DocxParser):
                 break
             question_level, p_text = 0, ''
             if from_page <= pn < to_page and p.text.strip():
-                question_level, p_text = docxQuestionLevel(p)
+                question_level, p_text = docx_question_level(p)
             if not question_level or question_level > 6: # not a question
                 last_answer = f'{last_answer}\n{p_text}'
                 current_image = self.get_picture(self.doc, p)

@@ -254,12 +254,6 @@ def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 
-def docxQuestionLevel(p):
-    if p.style.name.startswith('Heading'):
-        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
-    else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
-
 def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.

@@ -405,4 +399,4 @@ if __name__ == "__main__":
 
     def dummy(prog=None, msg=""):
         pass
-    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
rag/nlp/__init__.py CHANGED

@@ -497,3 +497,9 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
             add_chunk(sec[s: e], pos)
 
     return cks
+
+def docx_question_level(p):
+    if p.style.name.startswith('Heading'):
+        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
+    else:
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
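
Since `docx_question_level` now lives in `rag/nlp`, both `rag/app/manual.py` and `rag/app/qa.py` share it, and any other caller holding a python-docx `Document` can reuse it. A quick check, as a sketch (the document path is a placeholder):

```python
# Quick check of the shared helper; "demo.docx" is a hypothetical file.
from docx import Document
from rag.nlp import docx_question_level

doc = Document("demo.docx")
for p in doc.paragraphs:
    level, text = docx_question_level(p)
    # paragraphs styled "Heading N" return (N, text); everything else returns (0, text)
    if level:
        print(level, text)
```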