KevinHuSh committed
Commit 4e03dc3 · 1 Parent(s): 3054c20

fix position extraction bug (#93)


* fix position extraction bug

* remove delimiter for naive parser

api/apps/chunk_app.py CHANGED
@@ -60,7 +60,8 @@ def list():
         for id in sres.ids:
             d = {
                 "chunk_id": id,
-                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""),
+                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                    "content_with_weight", ""),
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
@@ -68,10 +69,12 @@ def list():
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": sres.field[id].get("position_int", "").split("\t")
             }
-            poss = []
-            for i in range(0, len(d["positions"]), 5):
-                poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])])
-            d["positions"] = poss
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:
@@ -137,10 +140,10 @@ def set():
             return get_data_error_result(retmsg="Document not found!")

         if doc.parser_id == ParserType.QA:
-            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
             if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
             q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
-            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))

         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -189,7 +192,8 @@ def create():
         md5 = hashlib.md5()
         md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
         chunck_id = md5.hexdigest()
-        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]}
+        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+             "content_with_weight": req["content_with_weight"]}
         d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
         d["important_kwd"] = req.get("important_kwd", [])
         d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
api/db/db_models.py CHANGED
@@ -527,7 +527,7 @@ class Dialog(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=255, null=True, help_text="dialog application name")
     description = TextField(null=True, help_text="Dialog description")
-    icon = CharField(max_length=16, null=False, help_text="dialog icon")
+    icon = TextField(null=True, help_text="icon base64 string")
     language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
     llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
     llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
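
The Dialog.icon column changes from a 16-character CharField to a nullable TextField so a base64-encoded image can be stored. The commit only touches the model; if an existing MySQL table needed the same change, a peewee/playhouse migration along these lines would do it (a sketch with placeholder credentials, not part of the commit, and the project may migrate differently):

    from peewee import MySQLDatabase, TextField
    from playhouse.migrate import MySQLMigrator, migrate

    db = MySQLDatabase("rag_flow", user="root", password="...", host="mysql")  # placeholder connection
    migrator = MySQLMigrator(db)

    with db.atomic():
        migrate(
            migrator.drop_column("dialog", "icon"),                      # old VARCHAR(16); drops existing values
            migrator.add_column("dialog", "icon", TextField(null=True))  # new nullable TEXT
        )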
deepdoc/parser/pdf_parser.py CHANGED
@@ -35,6 +35,7 @@ class HuParser:
             self.updown_cnt_mdl.set_param({"device": "cuda"})
         self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                        filename="updown_concat_xgb.model"))
+        self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!

@@ -683,7 +684,7 @@ class HuParser:
                                                              "layoutno", "")))

         left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-        poss.append((pn, left, right, top, bott))
+        poss.append((pn+self.page_from, left, right, top, bott))
         return self.page_images[pn] \
             .crop((left * ZM, top * ZM,
                    right * ZM, bott * ZM))
@@ -863,6 +864,7 @@ class HuParser:
         self.garbages = {}
         self.page_cum_height = [0]
         self.page_layout = []
+        self.page_from = page_from
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
@@ -947,7 +949,9 @@ class HuParser:
             left, right, top, bottom = float(left), float(
                 right), float(top), float(bottom)
             poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
-        if not poss: return
+        if not poss:
+            if need_position: return None, None
+            return

         max_width = np.max([right-left for (_, left, right, _, _) in poss])
         GAP = 6
@@ -969,7 +973,8 @@ class HuParser:
                             bottom, self.page_images[pns[0]].size[1])
                         ))
                 )
-            positions.append((pns[0], left, right, top, min(
+            if 0 < ii < len(poss)-1:
+                positions.append((pns[0]+self.page_from, left, right, top, min(
                     bottom, self.page_images[pns[0]].size[1])/ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:
@@ -980,8 +985,9 @@ class HuParser:
                                 self.page_images[pn].size[1])
                             ))
                     )
-                positions.append((pn, left, right, 0, min(
-                    bottom, self.page_images[pn].size[1]) / ZM))
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn+self.page_from, left, right, 0, min(
+                        bottom, self.page_images[pn].size[1]) / ZM))
                 bottom -= self.page_images[pn].size[1]

             if not imgs:
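
These hunks are the heart of the position-extraction fix. Long PDFs are parsed in page slices (see rag/svr/task_broker.py below), so page indices inside a slice start at 0; the page-loading step now records the slice's starting page in self.page_from, and the cropping and table/figure code report pn + self.page_from, i.e. absolute page numbers. The early return for an empty poss also honours need_position so callers always get the (image, positions) pair they expect. A toy illustration of the offset idea; the class below is illustrative, not HuParser:

    class SlicedPdf:
        def __init__(self, page_from, page_to):
            # a task only renders pages [page_from, page_to), so local indices start at 0
            self.page_from = page_from
            self.page_images = list(range(page_from, page_to))  # stand-in for rendered pages

        def absolute_box(self, local_pn, left, right, top, bottom):
            # report the page the box really lives on, not its index inside the slice
            return (local_pn + self.page_from, left, right, top, bottom)

    SlicedPdf(page_from=5, page_to=10).absolute_box(0, 10, 90, 5, 25)  # -> (5, 10, 90, 5, 25)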
docker/entrypoint.sh CHANGED
@@ -10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python

 function task_exe(){
     sleep 60;
-    while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done
+    while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 }

 function watch_broker(){
rag/app/book.py CHANGED
@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         callback(0.8, "Text extraction finished")

rag/app/manual.py CHANGED
@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         for b in self.boxes:
rag/app/naive.py CHANGED
@@ -40,7 +40,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
         #self._naive_vertical_merge()
rag/app/paper.py CHANGED
@@ -48,7 +48,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
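
book.py, manual.py, naive.py and paper.py all flip the same argument, from self._extract_table_figure(True, zoomin, False, True) to (True, zoomin, True, True). Judging from the parser, the third positional flag appears to control whether extracted tables are returned as HTML rather than plain rows, so every PDF chunker now keeps table structure alongside the table/figure positions it already requested; this reading is inferred from the call sites, not stated in the commit.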
rag/nlp/__init__.py CHANGED
@@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
             tk_nums[-1] += tnum

     for sec, pos in sections:
+        add_chunk(sec, pos)
+        continue
         s, e = 0, 1
         while e < len(sec):
             if sec[e] in delimiter:
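
The two added lines short-circuit the loop body: every section goes to add_chunk whole and the delimiter-splitting code below them is never reached, which is what the "remove delimiter for naive parser" bullet in the commit message refers to. A stripped-down sketch of the resulting behaviour, with a toy token counter; illustrative, not the real naive_merge:

    def naive_merge_sketch(sections, chunk_token_num=128):
        cks, tk_nums = [], []

        def count_tokens(text):                      # stand-in for the project's tokenizer
            return len(text.split())

        def add_chunk(text, pos=""):
            if not cks or tk_nums[-1] > chunk_token_num:
                cks.append(text)
                tk_nums.append(count_tokens(text))
            else:                                    # keep appending until the token budget is exceeded
                cks[-1] += text
                tk_nums[-1] += count_tokens(text)

        for sec, pos in sections:
            add_chunk(sec, pos)                      # whole section at once; no delimiter splitting
        return cks

    naive_merge_sketch([("First section.", ""), ("Second section.", "")])
    # -> ['First section.Second section.']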
rag/nlp/search.py CHANGED
@@ -83,7 +83,7 @@ class Dealer:
         else:
             s = s.sort(
                 {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
-                {"top_int": {"order": "asc", "unmapped_type": "float"}},
+                {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
                 {"create_time": {"order": "desc", "unmapped_type": "date"}},
                 {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
             )
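
top_int can hold several values per chunk (one per page the chunk spans), so adding "mode": "avg" tells Elasticsearch to order by the average of those values rather than the ascending-sort default, the minimum. The equivalent raw sort body, for reference; illustrative, with the field names used above:

    sort_clause = [
        {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
        # several top_int values per chunk -> sort by their average, not the minimum
        {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
        {"create_time": {"order": "desc", "unmapped_type": "date"}},
        {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}},
    ]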
rag/svr/task_broker.py CHANGED
@@ -83,10 +83,10 @@ def dispatch():
             pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
             for s,e in r["parser_config"].get("pages", [(0,100000)]):
                 e = min(e, pages)
-                for p in range(s, e, 10):
+                for p in range(s, e, 5):
                     task = new_task()
                     task["from_page"] = p
-                    task["to_page"] = min(p + 10, e)
+                    task["to_page"] = min(p + 5, e)
                     tsks.append(task)
         else:
             tsks.append(new_task())
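
dispatch() now slices PDF ingestion into 5-page tasks instead of 10-page ones, and the entrypoint.sh change above raises the number of task_executor workers from 2 to 4, so the smaller tasks can be processed with more parallelism. A quick sketch of the ranges the loop produces, mirroring the range/min logic above with an assumed 23-page document:

    def page_ranges(start, end, total_pages, step=5):
        end = min(end, total_pages)
        return [(p, min(p + step, end)) for p in range(start, end, step)]

    page_ranges(0, 100000, 23)           # [(0, 5), (5, 10), (10, 15), (15, 20), (20, 23)]
    page_ranges(0, 100000, 23, step=10)  # previous behaviour: [(0, 10), (10, 20), (20, 23)]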