KevinHuSh
committed on
Commit
·
279ca43
1
Parent(s):
ae21b62
fix task canceling bug (#98)
Browse files- api/apps/document_app.py +1 -2
- api/db/services/task_service.py +3 -2
- deepdoc/parser/pdf_parser.py +1 -1
- docker/.env +1 -1
- rag/app/book.py +1 -1
- rag/app/laws.py +2 -2
- rag/app/manual.py +1 -1
- rag/app/naive.py +1 -1
- rag/app/paper.py +1 -1
- rag/nlp/search.py +8 -1
- rag/svr/task_executor.py +4 -2
api/apps/document_app.py
CHANGED
@@ -316,8 +316,7 @@ def change_parser():
|
|
316 |
return get_data_error_result(retmsg="Not supported yet!")
|
317 |
|
318 |
e = DocumentService.update_by_id(doc.id,
|
319 |
-
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"
|
320 |
-
"token_num": 0, "chunk_num": 0, "process_duation": 0})
|
321 |
if not e:
|
322 |
return get_data_error_result(retmsg="Document not found!")
|
323 |
if doc.token_num > 0:
|
|
|
316 |
return get_data_error_result(retmsg="Not supported yet!")
|
317 |
|
318 |
e = DocumentService.update_by_id(doc.id,
|
319 |
+
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
|
|
|
320 |
if not e:
|
321 |
return get_data_error_result(retmsg="Document not found!")
|
322 |
if doc.token_num > 0:
|
api/db/services/task_service.py
CHANGED
@@ -73,8 +73,9 @@ class TaskService(CommonService):
|
|
73 |
@classmethod
|
74 |
@DB.connection_context()
|
75 |
def update_progress(cls, id, info):
|
76 |
-
|
77 |
-
cls.model.
|
|
|
78 |
if "progress" in info:
|
79 |
cls.model.update(progress=info["progress"]).where(
|
80 |
cls.model.id == id).execute()
|
|
|
73 |
@classmethod
|
74 |
@DB.connection_context()
|
75 |
def update_progress(cls, id, info):
|
76 |
+
if info["progress_msg"]:
|
77 |
+
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
|
78 |
+
cls.model.id == id).execute()
|
79 |
if "progress" in info:
|
80 |
cls.model.update(progress=info["progress"]).where(
|
81 |
cls.model.id == id).execute()
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -725,7 +725,7 @@ class HuParser:
|
|
725 |
(cropout(
|
726 |
bxs,
|
727 |
"figure", poss),
|
728 |
-
[txt]
|
729 |
positions.append(poss)
|
730 |
|
731 |
for k, bxs in tables.items():
|
|
|
725 |
(cropout(
|
726 |
bxs,
|
727 |
"figure", poss),
|
728 |
+
[txt]))
|
729 |
positions.append(poss)
|
730 |
|
731 |
for k, bxs in tables.items():
|
docker/.env
CHANGED
@@ -16,7 +16,7 @@ MEM_LIMIT=4073741824
|
|
16 |
MYSQL_PASSWORD=infini_rag_flow
|
17 |
MYSQL_PORT=5455
|
18 |
|
19 |
-
MINIO_USER=
|
20 |
MINIO_PASSWORD=infini_rag_flow
|
21 |
|
22 |
SVR_HTTP_PORT=9380
|
|
|
16 |
MYSQL_PASSWORD=infini_rag_flow
|
17 |
MYSQL_PORT=5455
|
18 |
|
19 |
+
MINIO_USER=rag_flow
|
20 |
MINIO_PASSWORD=infini_rag_flow
|
21 |
|
22 |
SVR_HTTP_PORT=9380
|
rag/app/book.py
CHANGED
@@ -28,7 +28,7 @@ class Pdf(PdfParser):
|
|
28 |
from_page,
|
29 |
to_page,
|
30 |
callback)
|
31 |
-
callback("OCR finished")
|
32 |
|
33 |
from timeit import default_timer as timer
|
34 |
start = timer()
|
|
|
28 |
from_page,
|
29 |
to_page,
|
30 |
callback)
|
31 |
+
callback(msg="OCR finished")
|
32 |
|
33 |
from timeit import default_timer as timer
|
34 |
start = timer()
|
rag/app/laws.py
CHANGED
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
|
|
57 |
to_page,
|
58 |
callback
|
59 |
)
|
60 |
-
callback("OCR finished")
|
61 |
|
62 |
from timeit import default_timer as timer
|
63 |
start = timer()
|
@@ -135,6 +135,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|
135 |
|
136 |
if __name__ == "__main__":
|
137 |
import sys
|
138 |
-
def dummy(
|
139 |
pass
|
140 |
chunk(sys.argv[1], callback=dummy)
|
|
|
57 |
to_page,
|
58 |
callback
|
59 |
)
|
60 |
+
callback(msg="OCR finished")
|
61 |
|
62 |
from timeit import default_timer as timer
|
63 |
start = timer()
|
|
|
135 |
|
136 |
if __name__ == "__main__":
|
137 |
import sys
|
138 |
+
def dummy(prog=None, msg=""):
|
139 |
pass
|
140 |
chunk(sys.argv[1], callback=dummy)
|
rag/app/manual.py
CHANGED
@@ -22,7 +22,7 @@ class Pdf(PdfParser):
|
|
22 |
to_page,
|
23 |
callback
|
24 |
)
|
25 |
-
callback("OCR finished.")
|
26 |
|
27 |
from timeit import default_timer as timer
|
28 |
start = timer()
|
|
|
22 |
to_page,
|
23 |
callback
|
24 |
)
|
25 |
+
callback(msg="OCR finished.")
|
26 |
|
27 |
from timeit import default_timer as timer
|
28 |
start = timer()
|
rag/app/naive.py
CHANGED
@@ -29,7 +29,7 @@ class Pdf(PdfParser):
|
|
29 |
to_page,
|
30 |
callback
|
31 |
)
|
32 |
-
callback("OCR finished")
|
33 |
|
34 |
from timeit import default_timer as timer
|
35 |
start = timer()
|
|
|
29 |
to_page,
|
30 |
callback
|
31 |
)
|
32 |
+
callback(msg="OCR finished")
|
33 |
|
34 |
from timeit import default_timer as timer
|
35 |
start = timer()
|
rag/app/paper.py
CHANGED
@@ -36,7 +36,7 @@ class Pdf(PdfParser):
|
|
36 |
to_page,
|
37 |
callback
|
38 |
)
|
39 |
-
callback("OCR finished.")
|
40 |
|
41 |
from timeit import default_timer as timer
|
42 |
start = timer()
|
|
|
36 |
to_page,
|
37 |
callback
|
38 |
)
|
39 |
+
callback(msg="OCR finished.")
|
40 |
|
41 |
from timeit import default_timer as timer
|
42 |
start = timer()
|
rag/nlp/search.py
CHANGED
@@ -305,8 +305,15 @@ class Dealer:
|
|
305 |
"similarity": sim[i],
|
306 |
"vector_similarity": vsim[i],
|
307 |
"term_similarity": tsim[i],
|
308 |
-
"vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim)))
|
|
|
309 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
ranks["chunks"].append(d)
|
311 |
if dnm not in ranks["doc_aggs"]:
|
312 |
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
|
|
305 |
"similarity": sim[i],
|
306 |
"vector_similarity": vsim[i],
|
307 |
"term_similarity": tsim[i],
|
308 |
+
"vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim))),
|
309 |
+
"positions": sres.field[id].get("position_int", "").split("\t")
|
310 |
}
|
311 |
+
if len(d["positions"]) % 5 == 0:
|
312 |
+
poss = []
|
313 |
+
for i in range(0, len(d["positions"]), 5):
|
314 |
+
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
|
315 |
+
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
|
316 |
+
d["positions"] = poss
|
317 |
ranks["chunks"].append(d)
|
318 |
if dnm not in ranks["doc_aggs"]:
|
319 |
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
rag/svr/task_executor.py
CHANGED
@@ -25,6 +25,7 @@ import traceback
|
|
25 |
from functools import partial
|
26 |
from timeit import default_timer as timer
|
27 |
|
|
|
28 |
from elasticsearch_dsl import Q
|
29 |
|
30 |
from api.db.services.task_service import TaskService
|
@@ -177,10 +178,11 @@ def embedding(docs, mdl, parser_config={}, callback=None):
|
|
177 |
tts, c = mdl.encode(tts)
|
178 |
tk_count += c
|
179 |
|
180 |
-
cnts_ = []
|
181 |
for i in range(0, len(cnts), 32):
|
182 |
vts, c = mdl.encode(cnts[i: i+32])
|
183 |
-
cnts_
|
|
|
184 |
tk_count += c
|
185 |
callback(msg="")
|
186 |
cnts = cnts_
|
|
|
25 |
from functools import partial
|
26 |
from timeit import default_timer as timer
|
27 |
|
28 |
+
import numpy as np
|
29 |
from elasticsearch_dsl import Q
|
30 |
|
31 |
from api.db.services.task_service import TaskService
|
|
|
178 |
tts, c = mdl.encode(tts)
|
179 |
tk_count += c
|
180 |
|
181 |
+
cnts_ = np.array([])
|
182 |
for i in range(0, len(cnts), 32):
|
183 |
vts, c = mdl.encode(cnts[i: i+32])
|
184 |
+
if len(cnts_) == 0: cnts_ = vts
|
185 |
+
else: cnts_ = np.concatenate((cnts_, vts), axis=0)
|
186 |
tk_count += c
|
187 |
callback(msg="")
|
188 |
cnts = cnts_
|