KevinHuSh committed
Commit: 4e03dc3 · 1 parent: 3054c20

fix position extraction bug (#93)

* fix position extraction bug
* remove delimiter for naive parser
- api/apps/chunk_app.py +12 -8
- api/db/db_models.py +1 -1
- deepdoc/parser/pdf_parser.py +11 -5
- docker/entrypoint.sh +1 -1
- rag/app/book.py +1 -1
- rag/app/manual.py +1 -1
- rag/app/naive.py +1 -1
- rag/app/paper.py +1 -1
- rag/nlp/__init__.py +2 -0
- rag/nlp/search.py +1 -1
- rag/svr/task_broker.py +2 -2
api/apps/chunk_app.py
CHANGED
@@ -60,7 +60,8 @@ def list():
         for id in sres.ids:
             d = {
                 "chunk_id": id,
-                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                    "content_with_weight", ""),
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),

@@ -68,10 +69,12 @@ def list():
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": sres.field[id].get("position_int", "").split("\t")
             }
-
-
-
-
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:

@@ -137,10 +140,10 @@ def set():
             return get_data_error_result(retmsg="Document not found!")

         if doc.parser_id == ParserType.QA:
-            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
             if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
             q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
-            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))

         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]

@@ -189,7 +192,8 @@ def create():
         md5 = hashlib.md5()
         md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
         chunck_id = md5.hexdigest()
-        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+             "content_with_weight": req["content_with_weight"]}
         d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
         d["important_kwd"] = req.get("important_kwd", [])
         d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
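The list() change above decodes the stored "position_int" field back into per-box position tuples. A minimal sketch of that decoding, assuming the tab-separated layout of five values per box (page, left, right, top, bottom) implied by the diff; the function name is only illustrative:

# Sketch of the position decoding added in list(): "position_int" is a flat,
# tab-separated string whose values come in groups of five
# (page, left, right, top, bottom), matching what pdf_parser records.
def decode_positions(position_int: str):
    parts = position_int.split("\t")
    if not position_int or len(parts) % 5 != 0:
        # leave untouched if the layout is not the expected 5-value groups
        return parts
    poss = []
    for i in range(0, len(parts), 5):
        poss.append([float(parts[i + j]) for j in range(5)])
    return poss

# Example: two boxes, one on page 1 and one on page 2
print(decode_positions("1\t10\t200\t30\t60\t2\t15\t210\t35\t70"))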
api/db/db_models.py
CHANGED
@@ -527,7 +527,7 @@ class Dialog(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=255, null=True, help_text="dialog application name")
     description = TextField(null=True, help_text="Dialog description")
-    icon =
+    icon = TextField(null=True, help_text="icon base64 string")
     language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
     llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
     llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
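The icon column now stores a base64 string, per its help_text. A minimal sketch of producing such a value from an image file; the file name is just an example:

import base64

with open("logo.png", "rb") as f:
    icon_b64 = base64.b64encode(f.read()).decode("utf-8")
# icon_b64 is the kind of string Dialog.icon is meant to hold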
deepdoc/parser/pdf_parser.py
CHANGED
@@ -35,6 +35,7 @@ class HuParser:
         self.updown_cnt_mdl.set_param({"device": "cuda"})
         self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                        filename="updown_concat_xgb.model"))
+        self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!

@@ -683,7 +684,7 @@ class HuParser:
                                               "layoutno", "")))

         left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-        poss.append((pn, left, right, top, bott))
+        poss.append((pn+self.page_from, left, right, top, bott))
         return self.page_images[pn] \
             .crop((left * ZM, top * ZM,
                    right * ZM, bott * ZM))

@@ -863,6 +864,7 @@ class HuParser:
         self.garbages = {}
         self.page_cum_height = [0]
         self.page_layout = []
+        self.page_from = page_from
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in

@@ -947,7 +949,9 @@ class HuParser:
             left, right, top, bottom = float(left), float(
                 right), float(top), float(bottom)
             poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
-        if not poss:
+        if not poss:
+            if need_position: return None, None
+            return

         max_width = np.max([right-left for (_, left, right, _, _) in poss])
         GAP = 6

@@ -969,7 +973,8 @@ class HuParser:
                                  bottom, self.page_images[pns[0]].size[1])
                              ))
                 )
-
+            if 0 < ii < len(poss)-1:
+                positions.append((pns[0]+self.page_from, left, right, top, min(
                     bottom, self.page_images[pns[0]].size[1])/ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:

@@ -980,8 +985,9 @@ class HuParser:
                                  self.page_images[pn].size[1])
                              ))
                 )
-
-
+            if 0 < ii < len(poss) - 1:
+                positions.append((pn+self.page_from, left, right, 0, min(
+                    bottom, self.page_images[pn].size[1]) / ZM))
             bottom -= self.page_images[pn].size[1]

         if not imgs:
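The page_from additions keep positions document-absolute: when the parser works on a window of pages (see the five-page tasks created by task_broker below), box page indices restart at 0 inside that window, so every recorded page number is offset by the window's first page. A small illustrative sketch; the function name is hypothetical:

# Why page_from matters (sketch): positions recorded as
# (page, left, right, top, bottom) inside a parsing window would otherwise all
# point at the first few pages of the document.
def to_absolute(positions, page_from):
    # positions are tuples local to the current window of pages
    return [(pn + page_from, l, r, t, b) for pn, l, r, t, b in positions]

local = [(0, 10, 200, 30, 60), (1, 10, 200, 30, 60)]
print(to_absolute(local, page_from=5))  # boxes on pages 5 and 6 of the whole PDF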
docker/entrypoint.sh
CHANGED
@@ -10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python

 function task_exe(){
   sleep 60;
-  while [ 1 -eq 1 ];do mpirun -n
+  while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 }

 function watch_broker(){
rag/app/book.py
CHANGED
@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin,
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         callback(0.8, "Text extraction finished")
rag/app/manual.py
CHANGED
@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin,
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         for b in self.boxes:
rag/app/naive.py
CHANGED
@@ -40,7 +40,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin,
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
         #self._naive_vertical_merge()
rag/app/paper.py
CHANGED
@@ -48,7 +48,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin,
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
rag/nlp/__init__.py
CHANGED
@@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         tk_nums[-1] += tnum

     for sec, pos in sections:
+        add_chunk(sec, pos)
+        continue
         s, e = 0, 1
         while e < len(sec):
             if sec[e] in delimiter:
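With this change naive_merge keeps each parsed section whole: add_chunk is called once per section and the unconditional continue skips the old delimiter-splitting loop, which matches the "remove delimiter for naive parser" item in the commit message. A simplified sketch of the resulting behavior; the token counter and add_chunk body here are stand-ins for the project's real helpers, not their actual implementations:

def naive_merge_sketch(sections, chunk_token_num=128):
    chunks, tk_nums = [""], [0]

    def add_chunk(text, pos):
        # crude token count and position handling, for illustration only
        tnum = len(text.split())
        if tk_nums[-1] > chunk_token_num:  # start a new chunk once the budget is spent
            chunks.append("")
            tk_nums.append(0)
        chunks[-1] += text
        tk_nums[-1] += tnum

    for sec, pos in sections:
        add_chunk(sec, pos)  # whole section kept together; no delimiter split
        continue             # mirrors the diff: the splitting loop below never runs
    return chunks

print(naive_merge_sketch([("First section text.", "@1"), ("Second section text.", "@2")]))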
rag/nlp/search.py
CHANGED
@@ -83,7 +83,7 @@ class Dealer:
         else:
             s = s.sort(
                 {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
-                {"top_int": {"order": "asc", "unmapped_type": "float"}},
+                {"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
                 {"create_time": {"order": "desc", "unmapped_type": "date"}},
                 {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
             )
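The added "mode": "avg" tells Elasticsearch how to sort on top_int when the field holds several values, presumably one per box for chunks that span multiple boxes; averaging the tops gives a stable reading order. For clarity, the same sort written as a plain request body (the surrounding query is omitted):

# Plain Elasticsearch sort body equivalent to the s.sort(...) call above.
sort_body = {
    "sort": [
        {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
        {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
        {"create_time": {"order": "desc", "unmapped_type": "date"}},
        {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}},
    ]
}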
rag/svr/task_broker.py
CHANGED
@@ -83,10 +83,10 @@ def dispatch():
             pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
             for s,e in r["parser_config"].get("pages", [(0,100000)]):
                 e = min(e, pages)
-                for p in range(s, e,
+                for p in range(s, e, 5):
                     task = new_task()
                     task["from_page"] = p
-                    task["to_page"] = min(p +
+                    task["to_page"] = min(p + 5, e)
                     tsks.append(task)
         else:
             tsks.append(new_task())
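The dispatcher now cuts each requested page range into fixed five-page tasks, each carrying its own from_page/to_page; this windowing is also why HuParser gains page_from above. A standalone sketch of the splitting, with new_task() reduced to a plain dict and the function name chosen for illustration:

# Sketch of the dispatch windowing: a PDF page range [s, e) becomes a list of
# five-page tasks; the last task is clipped to the end of the range.
def split_into_tasks(s, e, step=5):
    tasks = []
    for p in range(s, e, step):
        tasks.append({"from_page": p, "to_page": min(p + step, e)})
    return tasks

print(split_into_tasks(0, 12))
# [{'from_page': 0, 'to_page': 5}, {'from_page': 5, 'to_page': 10}, {'from_page': 10, 'to_page': 12}]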