Kevin Hu committed
Commit c337e13 · 1 Parent(s): d7fa9e2

Updates on parsing progress, including more detailed time cost inform… (#3402)


### What problem does this PR solve?

#3401

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (2)
  1. api/validation.py +14 -3
  2. rag/svr/task_executor.py +8 -3
api/validation.py CHANGED

```diff
@@ -32,7 +32,18 @@ def python_version_validation():
 
 python_version_validation()
 
+
 # Download nltk data
-import nltk
-nltk.download('wordnet', halt_on_error=False, quiet=True)
-nltk.download('punkt_tab', halt_on_error=False, quiet=True)
+def download_nltk_data():
+    import nltk
+    nltk.download('wordnet', halt_on_error=False, quiet=True)
+    nltk.download('punkt_tab', halt_on_error=False, quiet=True)
+
+
+try:
+    from multiprocessing import Pool
+    pool = Pool(processes=1)
+    thr = pool.apply_async(download_nltk_data)
+    binary = thr.get(timeout=60)
+except Exception as e:
+    print('\x1b[6;37;41m WARNING \x1b[0m' + "Downloading NLTK data failure.", flush=True)
```
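The download is pushed into a one-process `multiprocessing.Pool` so that a stalled NLTK download cannot hang startup: `thr.get(timeout=60)` raises `multiprocessing.TimeoutError` after 60 seconds, and the broad `except Exception` downgrades any failure to a console warning. A minimal, self-contained sketch of the same pattern (the function and messages below are illustrative stand-ins, not code from this PR):

```python
# Sketch: run a potentially blocking download in a worker process and give up
# after a fixed deadline instead of blocking startup indefinitely.
from multiprocessing import Pool


def slow_download():
    # Hypothetical stand-in for download_nltk_data(); simulates a stalled fetch.
    import time
    time.sleep(120)


if __name__ == "__main__":
    with Pool(processes=1) as pool:
        task = pool.apply_async(slow_download)
        try:
            task.get(timeout=60)  # raises multiprocessing.TimeoutError after 60s
        except Exception:
            # Mirror the PR: warn and continue instead of failing startup.
            print("WARNING: NLTK data download did not finish in time.", flush=True)
```

Exiting the `with Pool(...)` block in this sketch also terminates the stuck worker process, so a timed-out download does not linger in the background.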
rag/svr/task_executor.py CHANGED

```diff
@@ -218,14 +218,17 @@ def build(row):
         logger.info("MINIO PUT({}):{}".format(row["name"], el))
 
     if row["parser_config"].get("auto_keywords", 0):
+        st = timer()
         callback(msg="Start to generate keywords for every chunk ...")
         chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
         for d in docs:
             d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
                                                     row["parser_config"]["auto_keywords"]).split(",")
             d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
+        callback(msg="Keywords generation completed in {:.2f}s".format(timer()-st))
 
     if row["parser_config"].get("auto_questions", 0):
+        st = timer()
         callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
         for d in docs:
@@ -236,6 +239,7 @@ def build(row):
             d["content_ltks"] += " " + qst
             if "content_sm_ltks" in d:
                 d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+        callback(msg="Question generation completed in {:.2f}s".format(timer()-st))
 
     return docs
 
@@ -364,8 +368,8 @@ def main():
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         callback(
-            msg="Finished slicing files(%d). Start to embedding the content." %
-            len(cks))
+            msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks), timer() - st)
+        )
         st = timer()
         try:
             tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
@@ -374,7 +378,7 @@ def main():
             logger.exception("run_rembedding got exception")
             tk_count = 0
         logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
-        callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
+        callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
 
         # logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
         init_kb(r, vector_size)
@@ -396,6 +400,7 @@ def main():
         if TaskService.do_cancel(r["id"]):
             docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
             continue
+        callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
         callback(1., "Done!")
         DocumentService.increment_chunk_num(
             r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
```