KevinHuSh committed on
Commit
279ca43
·
1 Parent(s): ae21b62

fix task canceling bug (#98)

Browse files
api/apps/document_app.py CHANGED
@@ -316,8 +316,7 @@ def change_parser():
316
  return get_data_error_result(retmsg="Not supported yet!")
317
 
318
  e = DocumentService.update_by_id(doc.id,
319
- {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0",
320
- "token_num": 0, "chunk_num": 0, "process_duation": 0})
321
  if not e:
322
  return get_data_error_result(retmsg="Document not found!")
323
  if doc.token_num > 0:
 
316
  return get_data_error_result(retmsg="Not supported yet!")
317
 
318
  e = DocumentService.update_by_id(doc.id,
319
+ {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
 
320
  if not e:
321
  return get_data_error_result(retmsg="Document not found!")
322
  if doc.token_num > 0:
api/db/services/task_service.py CHANGED
@@ -73,8 +73,9 @@ class TaskService(CommonService):
73
  @classmethod
74
  @DB.connection_context()
75
  def update_progress(cls, id, info):
76
- cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
77
- cls.model.id == id).execute()
 
78
  if "progress" in info:
79
  cls.model.update(progress=info["progress"]).where(
80
  cls.model.id == id).execute()
 
73
  @classmethod
74
  @DB.connection_context()
75
  def update_progress(cls, id, info):
76
+ if info["progress_msg"]:
77
+ cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
78
+ cls.model.id == id).execute()
79
  if "progress" in info:
80
  cls.model.update(progress=info["progress"]).where(
81
  cls.model.id == id).execute()
deepdoc/parser/pdf_parser.py CHANGED
@@ -725,7 +725,7 @@ class HuParser:
725
  (cropout(
726
  bxs,
727
  "figure", poss),
728
- [txt] if not return_html else [f"<p>{txt}</p>"]))
729
  positions.append(poss)
730
 
731
  for k, bxs in tables.items():
 
725
  (cropout(
726
  bxs,
727
  "figure", poss),
728
+ [txt]))
729
  positions.append(poss)
730
 
731
  for k, bxs in tables.items():
docker/.env CHANGED
@@ -16,7 +16,7 @@ MEM_LIMIT=4073741824
16
  MYSQL_PASSWORD=infini_rag_flow
17
  MYSQL_PORT=5455
18
 
19
- MINIO_USER=infiniflow
20
  MINIO_PASSWORD=infini_rag_flow
21
 
22
  SVR_HTTP_PORT=9380
 
16
  MYSQL_PASSWORD=infini_rag_flow
17
  MYSQL_PORT=5455
18
 
19
+ MINIO_USER=rag_flow
20
  MINIO_PASSWORD=infini_rag_flow
21
 
22
  SVR_HTTP_PORT=9380
rag/app/book.py CHANGED
@@ -28,7 +28,7 @@ class Pdf(PdfParser):
28
  from_page,
29
  to_page,
30
  callback)
31
- callback("OCR finished")
32
 
33
  from timeit import default_timer as timer
34
  start = timer()
 
28
  from_page,
29
  to_page,
30
  callback)
31
+ callback(msg="OCR finished")
32
 
33
  from timeit import default_timer as timer
34
  start = timer()
rag/app/laws.py CHANGED
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
57
  to_page,
58
  callback
59
  )
60
- callback("OCR finished")
61
 
62
  from timeit import default_timer as timer
63
  start = timer()
@@ -135,6 +135,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
135
 
136
  if __name__ == "__main__":
137
  import sys
138
- def dummy(a, b):
139
  pass
140
  chunk(sys.argv[1], callback=dummy)
 
57
  to_page,
58
  callback
59
  )
60
+ callback(msg="OCR finished")
61
 
62
  from timeit import default_timer as timer
63
  start = timer()
 
135
 
136
  if __name__ == "__main__":
137
  import sys
138
+ def dummy(prog=None, msg=""):
139
  pass
140
  chunk(sys.argv[1], callback=dummy)
rag/app/manual.py CHANGED
@@ -22,7 +22,7 @@ class Pdf(PdfParser):
22
  to_page,
23
  callback
24
  )
25
- callback("OCR finished.")
26
 
27
  from timeit import default_timer as timer
28
  start = timer()
 
22
  to_page,
23
  callback
24
  )
25
+ callback(msg="OCR finished.")
26
 
27
  from timeit import default_timer as timer
28
  start = timer()
rag/app/naive.py CHANGED
@@ -29,7 +29,7 @@ class Pdf(PdfParser):
29
  to_page,
30
  callback
31
  )
32
- callback("OCR finished")
33
 
34
  from timeit import default_timer as timer
35
  start = timer()
 
29
  to_page,
30
  callback
31
  )
32
+ callback(msg="OCR finished")
33
 
34
  from timeit import default_timer as timer
35
  start = timer()
rag/app/paper.py CHANGED
@@ -36,7 +36,7 @@ class Pdf(PdfParser):
36
  to_page,
37
  callback
38
  )
39
- callback("OCR finished.")
40
 
41
  from timeit import default_timer as timer
42
  start = timer()
 
36
  to_page,
37
  callback
38
  )
39
+ callback(msg="OCR finished.")
40
 
41
  from timeit import default_timer as timer
42
  start = timer()
rag/nlp/search.py CHANGED
@@ -305,8 +305,15 @@ class Dealer:
305
  "similarity": sim[i],
306
  "vector_similarity": vsim[i],
307
  "term_similarity": tsim[i],
308
- "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim)))
 
309
  }
 
 
 
 
 
 
310
  ranks["chunks"].append(d)
311
  if dnm not in ranks["doc_aggs"]:
312
  ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
 
305
  "similarity": sim[i],
306
  "vector_similarity": vsim[i],
307
  "term_similarity": tsim[i],
308
+ "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim))),
309
+ "positions": sres.field[id].get("position_int", "").split("\t")
310
  }
311
+ if len(d["positions"]) % 5 == 0:
312
+ poss = []
313
+ for i in range(0, len(d["positions"]), 5):
314
+ poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
315
+ float(d["positions"][i + 3]), float(d["positions"][i + 4])])
316
+ d["positions"] = poss
317
  ranks["chunks"].append(d)
318
  if dnm not in ranks["doc_aggs"]:
319
  ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
rag/svr/task_executor.py CHANGED
@@ -25,6 +25,7 @@ import traceback
25
  from functools import partial
26
  from timeit import default_timer as timer
27
 
 
28
  from elasticsearch_dsl import Q
29
 
30
  from api.db.services.task_service import TaskService
@@ -177,10 +178,11 @@ def embedding(docs, mdl, parser_config={}, callback=None):
177
  tts, c = mdl.encode(tts)
178
  tk_count += c
179
 
180
- cnts_ = []
181
  for i in range(0, len(cnts), 32):
182
  vts, c = mdl.encode(cnts[i: i+32])
183
- cnts_.extend(vts)
 
184
  tk_count += c
185
  callback(msg="")
186
  cnts = cnts_
 
25
  from functools import partial
26
  from timeit import default_timer as timer
27
 
28
+ import numpy as np
29
  from elasticsearch_dsl import Q
30
 
31
  from api.db.services.task_service import TaskService
 
178
  tts, c = mdl.encode(tts)
179
  tk_count += c
180
 
181
+ cnts_ = np.array([])
182
  for i in range(0, len(cnts), 32):
183
  vts, c = mdl.encode(cnts[i: i+32])
184
+ if len(cnts_) == 0: cnts_ = vts
185
+ else: cnts_ = np.concatenate((cnts_, vts), axis=0)
186
  tk_count += c
187
  callback(msg="")
188
  cnts = cnts_