KevinHuSh committed
Commit b83edb4 · 1 parent: 830bf29

change callback strategy, add timezone to docker (#96)

Files changed:
- .gitignore +0 -1
- api/apps/document_app.py +14 -11
- deepdoc/parser/pdf_parser.py +5 -1
- docker/.env +3 -1
- docker/docker-compose.yml +6 -1
- rag/app/book.py +9 -8
- rag/app/laws.py +5 -3
- rag/app/manual.py +7 -5
- rag/app/naive.py +8 -8
- rag/app/paper.py +5 -3
- rag/app/presentation.py +1 -1
- rag/llm/embedding_model.py +1 -0
- rag/nlp/__init__.py +23 -4
- rag/svr/task_broker.py +1 -1
- rag/svr/task_executor.py +13 -5
.gitignore
CHANGED

@@ -20,5 +20,4 @@ Cargo.lock
 *.trie

 .idea/
-.env
 .vscode/
api/apps/document_app.py
CHANGED

@@ -141,7 +141,7 @@ def list():
     try:
         docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data={"total":tol, "docs": docs})
+        return get_json_result(data={"total": tol, "docs": docs})
     except Exception as e:
         return server_error_response(e)

@@ -217,7 +217,7 @@ def rm():
            return get_data_error_result(retmsg="Tenant not found!")
        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))

-        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num
+        DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
        if not DocumentService.delete_by_id(req["doc_id"]):
            return get_data_error_result(
                retmsg="Database error (Document removal)!")

@@ -241,7 +241,7 @@ def run():
            info["chunk_num"] = 0
            info["token_num"] = 0
            DocumentService.update_by_id(id, info)
-            #if str(req["run"]) == TaskStatus.CANCEL.value:
+            # if str(req["run"]) == TaskStatus.CANCEL.value:
            tenant_id = DocumentService.get_tenant_id(id)
            if not tenant_id:
                return get_data_error_result(retmsg="Tenant not found!")

@@ -281,7 +281,7 @@ def rename():


 @manager.route('/get/<doc_id>', methods=['GET'])
-
+# @login_required
 def get(doc_id):
     try:
         e, doc = DocumentService.get_by_id(doc_id)

@@ -292,8 +292,9 @@ def get(doc_id):
        ext = re.search(r"\.([^.]+)$", doc.name)
        if ext:
            if doc.type == FileType.VISUAL.value:
-                response.headers.set('Content-Type', 'image/%s'%ext.group(1))
-            else:
+                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+            else:
+                response.headers.set('Content-Type', 'application/%s' % ext.group(1))
        return response
    except Exception as e:
        return server_error_response(e)

@@ -314,11 +315,14 @@ def change_parser():
    if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
        return get_data_error_result(retmsg="Not supported yet!")

-    e = DocumentService.update_by_id(doc.id,
+    e = DocumentService.update_by_id(doc.id,
+                                     {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0",
+                                      "token_num": 0, "chunk_num": 0, "process_duation": 0})
    if not e:
        return get_data_error_result(retmsg="Document not found!")
-    if doc.token_num>0:
-        e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num
+    if doc.token_num > 0:
+        e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                doc.process_duation * -1)
    if not e:
        return get_data_error_result(retmsg="Document not found!")
    tenant_id = DocumentService.get_tenant_id(req["doc_id"])

@@ -332,7 +336,7 @@ def change_parser():


 @manager.route('/image/<image_id>', methods=['GET'])
-
+# @login_required
 def get_image(image_id):
     try:
         bkt, nm = image_id.split("-")

@@ -341,4 +345,3 @@ def get_image(image_id):
        return response
    except Exception as e:
        return server_error_response(e)
-
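Note on the rm() and change_parser() hunks above: cleanup reuses the additive accounting path by passing negated deltas into increment_chunk_num rather than introducing a separate decrement method. A minimal sketch of that idea follows; KnowledgeBaseStats and its fields are illustrative stand-ins, not the repo's actual DocumentService.

class KnowledgeBaseStats:
    # Hypothetical stand-in for the DocumentService/KB accounting layer.
    def __init__(self):
        self.token_num = 0
        self.chunk_num = 0
        self.process_duration = 0.0

    def increment_chunk_num(self, token_delta, chunk_delta, duration_delta):
        # Deltas may be negative: removal and re-parse negate the counts.
        self.token_num += token_delta
        self.chunk_num += chunk_delta
        self.process_duration += duration_delta

kb = KnowledgeBaseStats()
kb.increment_chunk_num(1200, 8, 3.5)            # document parsed
kb.increment_chunk_num(1200 * -1, 8 * -1, 0)    # document removed, as in rm()
assert kb.token_num == 0 and kb.chunk_num == 0

Keeping a single signed accumulator means the add and remove paths can never drift apart in which fields they cover.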
deepdoc/parser/pdf_parser.py
CHANGED

@@ -348,6 +348,9 @@ class HuParser:
             if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                 bxs.pop(i)
                 continue
+            if not b["text"].strip():
+                bxs.pop(i)
+                continue
             concatting_feats = [
                 b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                 len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",

@@ -856,7 +859,7 @@ class HuParser:
         pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
         return len(pdf)

-    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
+    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
         self.lefted_chars = []
         self.mean_height = []
         self.mean_width = []

@@ -917,6 +920,7 @@ class HuParser:
             # self.page_cum_height.append(
             #    np.max([c["bottom"] for c in chars]))
             self.__ocr(i + 1, img, chars, zoomin)
+            if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")

         if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
             bxes = [b for bxs in self.boxes for b in bxs]
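This file defines the callback contract the whole change is built around: __images__ now accepts an optional callable taking prog and msg keywords and fires it once per OCR'd page, scaling progress into the 0 to 0.6 band so later stages can report 0.6 through 1.0. A self-contained sketch of the same pattern, with the OCR call replaced by a placeholder:

import time

def process_pages(pages, callback=None):
    # OCR each page, reporting fractional progress if a callback is supplied.
    for i, page in enumerate(pages):
        time.sleep(0.01)  # placeholder for self.__ocr(i + 1, img, chars, zoomin)
        if callback:
            # OCR owns the first 60% of the bar, mirroring
            # (i + 1) * 0.6 / len(self.page_images) in the hunk above.
            callback(prog=(i + 1) * 0.6 / len(pages), msg="")

def report(prog=None, msg=""):
    print(f"progress={prog:.2f} {msg}")

process_pages(["p1", "p2", "p3"], callback=report)  # prints 0.20, 0.40, 0.60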
docker/.env
CHANGED

@@ -16,11 +16,13 @@ MEM_LIMIT=4073741824
 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455

-MINIO_USER=
+MINIO_USER=infiniflow
 MINIO_PASSWORD=infini_rag_flow

 SVR_HTTP_PORT=9380

+TIMEZONE='Asia/Shanghai'
+
 ######## OS setup for ES ###########
 # sysctl vm.max_map_count
 # sudo sysctl -w vm.max_map_count=262144
docker/docker-compose.yml
CHANGED

@@ -14,6 +14,7 @@ services:
       - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
       - bootstrap.memory_lock=false
       - xpack.security.enabled=false
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     ulimits:
       memlock:

@@ -41,6 +42,7 @@ services:
     environment:
       - SERVERNAME=kibana
       - ELASTICSEARCH_HOSTS=http://es01:9200
+      - TZ=${TIMEZONE}
     mem_limit: ${MEM_LIMIT}
     networks:
       - ragflow

@@ -50,7 +52,7 @@ services:
     container_name: ragflow-mysql
     environment:
       - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
-      - TZ
+      - TZ=${TIMEZONE}
     command:
       --max_connections=1000
       --character-set-server=utf8mb4

@@ -83,6 +85,7 @@ services:
     environment:
       - MINIO_ROOT_USER=${MINIO_USER}
       - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
+      - TZ=${TIMEZONE}
     volumes:
       - minio_data:/data
     networks:

@@ -108,6 +111,8 @@ services:
       - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
       - ./nginx/proxy.conf:/etc/nginx/proxy.conf
       - ./nginx/nginx.conf:/etc/nginx/nginx.conf
+    environment:
+      - TZ=${TIMEZONE}
     networks:
       - ragflow
     restart: always
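Every container now inherits TZ=${TIMEZONE} from docker/.env, so log timestamps and MySQL's session time agree across services. A quick way to confirm the variable took effect inside any Python-equipped container (this check is mine, not part of the commit):

import os
import time

os.environ.setdefault("TZ", "Asia/Shanghai")  # normally injected by compose
if hasattr(time, "tzset"):  # POSIX only; absent on Windows
    time.tzset()  # make the C library re-read TZ at runtime

print("zone:", time.tzname, "| local:", time.strftime("%Y-%m-%d %H:%M:%S %Z"))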
rag/app/book.py
CHANGED

@@ -26,26 +26,27 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page
-
+            to_page,
+            callback)
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.
+        callback(0.67, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
-        self.
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, True, True)

         callback(0.8, "Text extraction finished")

-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls


 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):

@@ -92,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
     if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
     else:
-        sections = [s.split("@") for s in sections]
+        sections = [s.split("@") for s,_ in sections]
         sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
         cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))

@@ -116,6 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):

 if __name__ == "__main__":
     import sys
-    def dummy(
+    def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
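The __main__ fix above pins down the callback signature the chunkers assume: it may be invoked as callback(0.67, "..."), as callback(msg="..."), or as callback("OCR finished") with the message riding in the progress slot, so a stub needs both parameters optional. A slightly more useful hedged stand-in than dummy for local testing:

def console_callback(prog=None, msg=""):
    # Accepts the three call shapes used by the chunkers:
    #   console_callback(0.67, "Layout analysis finished")
    #   console_callback(msg="OCR is running...")
    #   console_callback("OCR finished")   # progress slot reused for a message
    if isinstance(prog, str) and not msg:
        prog, msg = None, prog
    bar = f"{prog:.0%}" if isinstance(prog, float) else "--"
    print(f"[{bar}] {msg}")

console_callback(0.67, "Layout analysis finished")  # [67%] Layout analysis finished
console_callback(msg="OCR is running...")           # [--] OCR is running...
console_callback("OCR finished")                    # [--] OCR finished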
rag/app/laws.py
CHANGED

@@ -54,13 +54,15 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page
-
+            to_page,
+            callback
+        )
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.
+        callback(0.67, "Layout analysis finished")
         cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
         self._naive_vertical_merge()
rag/app/manual.py
CHANGED

@@ -19,20 +19,22 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page
-
+            to_page,
+            callback
+        )
+        callback("OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.
+        callback(0.65, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.
+        callback(0.67, "Table analysis finished.")
         self._text_merge()
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback(0.
+        callback(0.68, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
rag/app/naive.py
CHANGED

@@ -26,24 +26,24 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page
-
+            to_page,
+            callback
+        )
+        callback("OCR finished")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.
+        callback(0.63, "Layout analysis finished.")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback(0.
+        callback(0.65, "Table analysis finished.")
         self._text_merge()
-
-        self._filter_forpages()
-        callback(0.77, "Text merging finished")
+        callback(0.67, "Text merging finished")
         tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()

         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
-        #self._naive_vertical_merge()
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
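book, laws, manual, naive and paper now all follow the same schedule: __images__ consumes 0 to 0.6, then each parser hard-codes small bumps (0.63, 0.65, 0.67, 0.68, 0.75, 0.8) per stage. If one wanted to centralize those magic numbers instead of repeating them in every parser, one possible shape is sketched below; this is my refactor sketch, not something in this commit.

from typing import Callable, Optional

STAGES = [
    ("layout",  0.63, "Layout analysis finished."),
    ("tables",  0.65, "Table analysis finished."),
    ("merge",   0.67, "Text merging finished"),
    ("extract", 0.80, "Text extraction finished"),
]

def run_stages(stage_fns: dict, callback: Optional[Callable] = None):
    # stage_fns maps stage name -> zero-arg callable doing the work;
    # progress fractions and messages live in one table instead of per parser.
    for name, prog, msg in STAGES:
        stage_fns[name]()
        if callback:
            callback(prog, msg)

run_stages({name: (lambda: None) for name, _, _ in STAGES}, print)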
rag/app/paper.py
CHANGED

@@ -33,13 +33,15 @@ class Pdf(PdfParser):
             filename if not binary else binary,
             zoomin,
             from_page,
-            to_page
-
+            to_page,
+            callback
+        )
+        callback("OCR finished.")

         from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.
+        callback(0.63, "Layout analysis finished")
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
rag/app/presentation.py
CHANGED

@@ -49,7 +49,7 @@ class Pdf(PdfParser):

     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         callback(msg="OCR is running...")
-        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
rag/llm/embedding_model.py
CHANGED

@@ -56,6 +56,7 @@ class HuEmbedding(Base):


     def encode(self, texts: list, batch_size=32):
+        texts = [t[:2000] for t in texts]
         token_count = 0
         for t in texts: token_count += num_tokens_from_string(t)
         res = []
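The one-line guard clips every input to 2,000 characters before token counting and encoding, a cheap way to keep oversized chunks from blowing past the embedding model's sequence limit. Worth noting it is a character-level proxy, not a token-level one; a minimal restatement:

def clip_texts(texts, max_chars=2000):
    # Character-level cap, mirroring the commit: cheap and deterministic,
    # but only a proxy for the model's real token limit (CJK text packs
    # more tokens per character than English, so the cap is conservative
    # for some scripts and loose for others).
    return [t[:max_chars] for t in texts]

docs = ["short", "x" * 5000]
print([len(t) for t in clip_texts(docs)])  # [5, 2000]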
rag/nlp/__init__.py
CHANGED

@@ -114,6 +114,7 @@ def add_positions(d, poss):
         d["page_num_int"].append(pn+1)
         d["top_int"].append(top)
         d["position_int"].append((pn+1, left, right, top, bottom))
+    d["top_int"] = d["top_int"][:1]


 def remove_contents_table(sections, eng=False):

@@ -172,7 +173,7 @@ def hierarchical_merge(bull, sections, depth):

     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
-        if len(txt) >=
+        if len(txt.split(" "))>12 or (txt.find(" ")<0 and len(txt)) >= 32: return True
         return re.search(r"[,;,。;!!]", txt)

     for i, (txt, layout) in enumerate(sections):

@@ -181,12 +182,12 @@ def hierarchical_merge(bull, sections, depth):
             levels[j].append(i)
             break
         else:
-            if re.search(r"(title|head)", layout):
+            if re.search(r"(title|head)", layout) and not not_title(txt):
                 levels[bullets_size].append(i)
             else:
                 levels[bullets_size + 1].append(i)
     sections = [t for t, _ in sections]
-    for s in sections: print("--", s)
+    #for s in sections: print("--", s)

     def binary_search(arr, target):
         if not arr: return -1

@@ -220,11 +221,29 @@ def hierarchical_merge(bull, sections, depth):
             if jj > cks[-1][-1]: cks[-1].pop(-1)
             cks[-1].append(levels[ii][jj])
         for ii in cks[-1]: readed[ii] = True
+
+    if not cks:return cks
+
     for i in range(len(cks)):
         cks[i] = [sections[j] for j in cks[i][::-1]]
         print("--------------\n", "\n* ".join(cks[i]))

-
+    res = [[]]
+    num = [0]
+    for ck in cks:
+        if len(ck) == 1:
+            n = num_tokens_from_string(re.sub(r"@@[0-9]+.*", "", ck[0]))
+            if n + num[-1] < 218:
+                res[-1].append(ck[0])
+                num[-1] += n
+                continue
+            res.append(ck)
+            num.append(n)
+            continue
+        res.append(ck)
+        num.append(218)
+
+    return res


 def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
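The new tail of hierarchical_merge coalesces consecutive single-line chunks while a rough 218-token budget holds, which keeps heading-only chunks from reaching the index alone. The same logic extracted into a runnable form, with num_tokens_from_string replaced by a whitespace-count stand-in:

import re

def fake_token_count(s):
    # Stand-in for rag.nlp's num_tokens_from_string.
    return len(s.split())

def coalesce(cks, budget=218):
    # Fold consecutive one-element chunks together while the token budget
    # holds; multi-element chunks always start a fresh group and mark the
    # budget as spent, exactly as the commit's num.append(218) does.
    # Quirk kept from the original: res starts as [[]], so a leading
    # multi-element chunk leaves an empty first group.
    res, num = [[]], [0]
    for ck in cks:
        if len(ck) == 1:
            n = fake_token_count(re.sub(r"@@[0-9]+.*", "", ck[0]))
            if n + num[-1] < budget:
                res[-1].append(ck[0])
                num[-1] += n
                continue
            res.append(ck)
            num.append(n)
            continue
        res.append(ck)
        num.append(budget)
    return res

print(coalesce([["Chapter 1"], ["Overview"], ["body", "text", "lines"]]))
# [['Chapter 1', 'Overview'], ['body', 'text', 'lines']]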
rag/svr/task_broker.py
CHANGED

@@ -46,7 +46,7 @@ def collect(tm):
 def set_dispatching(docid):
     try:
         DocumentService.update_by_id(
-            docid, {"progress": random.
+            docid, {"progress": random.random()*1 / 100.,
                     "progress_msg": "Task dispatched...",
                     "process_begin_at": get_format_time()
             })
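set_dispatching now seeds progress with random.random()*1/100, a value strictly below 1%. Presumably the point is that a freshly dispatched document renders as "started" in the UI without overstating work done. A trivial restatement:

import random

def initial_progress():
    # Strictly under 1%: the bar shows as started, and any real stage
    # (OCR begins at > 0) will immediately overwrite it.
    return random.random() * 1 / 100.

print(f"{initial_progress():.4f}")  # e.g. 0.0042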
rag/svr/task_executor.py
CHANGED

@@ -72,7 +72,8 @@ def set_progress(task_id, from_page=0, to_page=-1,
         prog = -1

     if to_page > 0:
-        msg
+        if msg:
+            msg = f"Page({from_page}~{to_page}): " + msg
     d = {"progress_msg": msg}
     if prog is not None:
         d["progress"] = prog

@@ -168,7 +169,7 @@ def init_kb(row):
         open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))


-def embedding(docs, mdl, parser_config={}):
+def embedding(docs, mdl, parser_config={}, callback=None):
     tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
         d["content_with_weight"] for d in docs]
     tk_count = 0

@@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}):
         tts, c = mdl.encode(tts)
         tk_count += c

-
-
+    cnts_ = []
+    for i in range(0, len(cnts), 32):
+        vts, c = mdl.encode(cnts[i: i+32])
+        cnts_.extend(vts)
+        tk_count += c
+        callback(msg="")
+    cnts = cnts_
+
     title_w = float(parser_config.get("filename_embd_weight", 0.1))
     vects = (title_w * tts + (1 - title_w) *
              cnts) if len(tts) == len(cnts) else cnts

@@ -218,10 +225,11 @@ def main(comm, mod):
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         try:
-            tk_count = embedding(cks, embd_mdl, r["parser_config"])
+            tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
+            tk_count = 0

         callback(msg="Finished embedding! Start to build index!")
         init_kb(r)
|