Kevin Hu
committed on
Commit
·
69ced1e
1
Parent(s):
168c8d9
Fix chunk number error after re-parsing. (#4043)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
api/apps/document_app.py
CHANGED
@@ -356,12 +356,11 @@ def run():
|
|
356 |
try:
|
357 |
for id in req["doc_ids"]:
|
358 |
info = {"run": str(req["run"]), "progress": 0}
|
359 |
-
if str(req["run"]) == TaskStatus.RUNNING.value:
|
360 |
info["progress_msg"] = ""
|
361 |
info["chunk_num"] = 0
|
362 |
info["token_num"] = 0
|
363 |
DocumentService.update_by_id(id, info)
|
364 |
-
# if str(req["run"]) == TaskStatus.CANCEL.value:
|
365 |
tenant_id = DocumentService.get_tenant_id(id)
|
366 |
if not tenant_id:
|
367 |
return get_data_error_result(message="Tenant not found!")
|
|
|
356 |
try:
|
357 |
for id in req["doc_ids"]:
|
358 |
info = {"run": str(req["run"]), "progress": 0}
|
359 |
+
if str(req["run"]) == TaskStatus.RUNNING.value and req.get("delete", False):
|
360 |
info["progress_msg"] = ""
|
361 |
info["chunk_num"] = 0
|
362 |
info["token_num"] = 0
|
363 |
DocumentService.update_by_id(id, info)
|
|
|
364 |
tenant_id = DocumentService.get_tenant_id(id)
|
365 |
if not tenant_id:
|
366 |
return get_data_error_result(message="Tenant not found!")
|
api/db/services/task_service.py
CHANGED
@@ -248,8 +248,9 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|
248 |
|
249 |
prev_tasks = TaskService.get_tasks(doc["id"])
|
250 |
if prev_tasks:
|
|
|
251 |
for task in tsks:
|
252 |
-
reuse_prev_task_chunks(task, prev_tasks, chunking_config)
|
253 |
TaskService.filter_delete([Task.doc_id == doc["id"]])
|
254 |
chunk_ids = []
|
255 |
for task in prev_tasks:
|
@@ -257,6 +258,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|
257 |
chunk_ids.extend(task["chunk_ids"].split())
|
258 |
if chunk_ids:
|
259 |
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
|
|
|
260 |
|
261 |
bulk_insert_into_db(Task, tsks, True)
|
262 |
DocumentService.begin2parse(doc["id"])
|
@@ -267,14 +269,17 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|
267 |
SVR_QUEUE_NAME, message=t
|
268 |
), "Can't access Redis. Please check the Redis' status."
|
269 |
|
|
|
270 |
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
|
271 |
idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
|
272 |
if idx >= len(prev_tasks):
|
273 |
-
return
|
274 |
prev_task = prev_tasks[idx]
|
275 |
if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
|
276 |
-
return
|
277 |
task["chunk_ids"] = prev_task["chunk_ids"]
|
278 |
task["progress"] = 1.0
|
279 |
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
|
280 |
prev_task["chunk_ids"] = ""
|
|
|
|
|
|
248 |
|
249 |
prev_tasks = TaskService.get_tasks(doc["id"])
|
250 |
if prev_tasks:
|
251 |
+
ck_num = 0
|
252 |
for task in tsks:
|
253 |
+
ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
|
254 |
TaskService.filter_delete([Task.doc_id == doc["id"]])
|
255 |
chunk_ids = []
|
256 |
for task in prev_tasks:
|
|
|
258 |
chunk_ids.extend(task["chunk_ids"].split())
|
259 |
if chunk_ids:
|
260 |
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
|
261 |
+
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
|
262 |
|
263 |
bulk_insert_into_db(Task, tsks, True)
|
264 |
DocumentService.begin2parse(doc["id"])
|
|
|
269 |
SVR_QUEUE_NAME, message=t
|
270 |
), "Can't access Redis. Please check the Redis' status."
|
271 |
|
272 |
+
|
273 |
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict) -> int:
    """Reuse the chunks of a finished, identical previous task for ``task``.

    A previous task is reusable when it has the same ``from_page`` and the
    same ``digest`` as ``task``, has completed (``progress`` >= 1.0) and
    still owns a non-empty ``chunk_ids`` string.

    On a match, ``task`` is marked as done (``chunk_ids``, ``progress`` and
    ``progress_msg`` are set) and the previous task's ``chunk_ids`` is
    cleared in place.

    Args:
        task: The new task; must contain ``from_page``, ``to_page`` and
            ``digest``. Mutated when a previous task is reused.
        prev_tasks: Tasks from the previous parsing run, in any order.
            The reused entry is mutated.
        chunking_config: Unused here; kept so existing callers keep working.

    Returns:
        The number of whitespace-separated chunk ids taken over, or 0 when
        no previous task could be reused.
    """
    # Linear scan instead of bisect: bisect silently misses matches when
    # prev_tasks is not sorted by from_page, and only ever inspects the first
    # entry whose from_page is >= the target.
    prev_task = next(
        (
            candidate
            for candidate in prev_tasks
            if candidate["from_page"] == task["from_page"]
            and candidate["digest"] == task["digest"]
            and candidate["progress"] >= 1.0
            and candidate["chunk_ids"]
        ),
        None,
    )
    if prev_task is None:
        return 0

    task["chunk_ids"] = prev_task["chunk_ids"]
    task["progress"] = 1.0
    task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"

    # Clear the ids on the old task so a caller that later collects
    # prev_tasks' chunk_ids for deletion does not remove the reused chunks.
    prev_task["chunk_ids"] = ""

    return len(task["chunk_ids"].split())
|