aopstudio committed on
Commit
b6ebcd9
·
1 Parent(s): e04709e

Place pdf's image at the correct position in QA parser (#1235)

Browse files

### What problem does this PR solve?

Place PDF images at the correct position in the QA parser

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. rag/app/qa.py +45 -6
rag/app/qa.py CHANGED
@@ -100,27 +100,69 @@ class Pdf(PdfParser):
100
  last_index = -1
101
  last_box = {'text':''}
102
  last_bull = None
 
 
 
 
 
 
 
 
103
  for box in self.boxes:
104
  section, line_tag = box['text'], self._line_tag(box, zoomin)
105
  has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
106
  last_box, last_index, last_bull = box, index, has_bull
 
 
 
107
  if not has_bull: # No question bullet
108
  if not last_q:
 
 
109
  continue
110
  else:
111
- last_a = f'{last_a}{section}'
112
- last_tag = f'{last_tag}{line_tag}'
 
 
 
 
 
 
 
 
113
  else:
114
  if last_q:
115
- qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
 
 
 
 
 
 
 
116
  last_q, last_a, last_tag = '', '', ''
117
  last_q = has_bull.group()
118
  _, end = has_bull.span()
119
  last_a = section[end:]
120
  last_tag = line_tag
 
 
121
  if last_q:
122
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
123
  return qai_list, tbls
 
 
 
 
 
 
 
 
 
 
 
 
124
  class Docx(DocxParser):
125
  def __init__(self):
126
  pass
@@ -324,14 +366,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
324
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
325
  callback(0.1, "Start to parse.")
326
  pdf_parser = Pdf()
327
- count = 0
328
  qai_list, tbls = pdf_parser(filename if not binary else binary,
329
  from_page=0, to_page=10000, callback=callback)
330
 
331
- res = tokenize_table(tbls, doc, eng)
332
 
333
  for q, a, image, poss in qai_list:
334
- count += 1
335
  res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
336
  return res
337
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
 
100
  last_index = -1
101
  last_box = {'text':''}
102
  last_bull = None
103
+ def sort_key(element):
104
+ tbls_pn = element[1][0][0]
105
+ tbls_top = element[1][0][3]
106
+ return tbls_pn, tbls_top
107
+ tbls.sort(key=sort_key)
108
+ tbl_index = 0
109
+ last_pn, last_bottom = 0, 0
110
+ tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
111
  for box in self.boxes:
112
  section, line_tag = box['text'], self._line_tag(box, zoomin)
113
  has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
114
  last_box, last_index, last_bull = box, index, has_bull
115
+ line_pn = float(line_tag.lstrip('@@').split('\t')[0])
116
+ line_top = float(line_tag.rstrip('##').split('\t')[3])
117
+ tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
118
  if not has_bull: # No question bullet
119
  if not last_q:
120
+ if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
121
+ tbls_index += 1
122
  continue
123
  else:
124
+ sum_tag = line_tag
125
+ sum_section = section
126
+ while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
127
+ and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer
128
+ sum_tag = f'{tbl_tag}{sum_tag}'
129
+ sum_section = f'{tbl_text}{sum_section}'
130
+ tbl_index += 1
131
+ tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
132
+ last_a = f'{last_a}{sum_section}'
133
+ last_tag = f'{last_tag}{sum_tag}'
134
  else:
135
  if last_q:
136
+ while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
137
+ and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer
138
+ last_tag = f'{last_tag}{tbl_tag}'
139
+ last_a = f'{last_a}{tbl_text}'
140
+ tbl_index += 1
141
+ tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
142
+ image, poss = self.crop(last_tag, need_position=True)
143
+ qai_list.append((last_q, last_a, image, poss))
144
  last_q, last_a, last_tag = '', '', ''
145
  last_q = has_bull.group()
146
  _, end = has_bull.span()
147
  last_a = section[end:]
148
  last_tag = line_tag
149
+ last_bottom = float(line_tag.rstrip('##').split('\t')[4])
150
+ last_pn = line_pn
151
  if last_q:
152
  qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
153
  return qai_list, tbls
154
def get_tbls_info(self, tbls, tbl_index):
    """Describe the entry at ``tbl_index`` of ``tbls``.

    Each entry is indexed as ``entry[1][0]`` for its first position record
    (fields 0..4 are read as page, left, right, top, bottom) and
    ``entry[0][1]`` for its text pieces.
    NOTE(review): the stored page number looks zero-based, since 1 is added
    here before it is returned and written into the tag -- confirm.

    Returns a 7-tuple ``(pn, left, right, top, bottom, tag, text)`` where
    ``tag`` is the ``@@pn\\tleft\\tright\\ttop\\tbottom##`` string with the
    four coordinates formatted to one decimal place, and ``text`` is the
    concatenation of ``entry[0][1]``.

    When ``tbl_index`` is past the end of ``tbls`` a fixed placeholder tuple
    is returned instead of raising.
    """
    if tbl_index >= len(tbls):
        # Nothing left to consume: hand back the placeholder values.
        return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''

    entry = tbls[tbl_index]
    first_pos = entry[1][0]

    page_no = first_pos[0] + 1
    left = first_pos[1]
    right = first_pos[2]
    top = first_pos[3]
    bottom = first_pos[4]

    tag = f"@@{page_no}\t{left:.1f}\t{right:.1f}\t{top:.1f}\t{bottom:.1f}##"
    text = ''.join(entry[0][1])
    return page_no, left, right, top, bottom, tag, text
166
  class Docx(DocxParser):
167
  def __init__(self):
168
  pass
 
366
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
367
  callback(0.1, "Start to parse.")
368
  pdf_parser = Pdf()
 
369
  qai_list, tbls = pdf_parser(filename if not binary else binary,
370
  from_page=0, to_page=10000, callback=callback)
371
 
 
372
 
373
  for q, a, image, poss in qai_list:
 
374
  res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
375
  return res
376
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):