Kevin Hu committed on
Commit
a92e785
·
1 Parent(s): 447446d

refactor auto keywords and auto question (#2990)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Refactoring

Files changed (1) hide show
  1. rag/svr/task_executor.py +20 -17
rag/svr/task_executor.py CHANGED
@@ -199,23 +199,6 @@ def build(row):
199
  d["_id"] = md5.hexdigest()
200
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
201
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
202
-
203
- if row["parser_config"].get("auto_keywords", 0):
204
- chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
205
- d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
206
- row["parser_config"]["auto_keywords"]).split(",")
207
- d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
208
-
209
- if row["parser_config"].get("auto_questions", 0):
210
- chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
211
- qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
212
- ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
213
- qst = rag_tokenizer.tokenize(qst)
214
- if "content_ltks" in ck:
215
- ck["content_ltks"] += " " + qst
216
- if "content_sm_ltks" in ck:
217
- ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
218
-
219
  if not d.get("image"):
220
  docs.append(d)
221
  continue
@@ -239,6 +222,26 @@ def build(row):
239
  docs.append(d)
240
  cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  return docs
243
 
244
 
 
199
  d["_id"] = md5.hexdigest()
200
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
201
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  if not d.get("image"):
203
  docs.append(d)
204
  continue
 
222
  docs.append(d)
223
  cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
224
 
225
+ if row["parser_config"].get("auto_keywords", 0):
226
+ callback(msg="Start to generate keywords for every chunk ...")
227
+ chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
228
+ for d in docs:
229
+ d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
230
+ row["parser_config"]["auto_keywords"]).split(",")
231
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
232
+
233
+ if row["parser_config"].get("auto_questions", 0):
234
+ callback(msg="Start to generate questions for every chunk ...")
235
+ chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
236
+ for d in docs:
237
+ qst = question_proposal(chat_mdl, d["content_with_weight"], row["parser_config"]["auto_questions"])
238
+ d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
239
+ qst = rag_tokenizer.tokenize(qst)
240
+ if "content_ltks" in d:
241
+ d["content_ltks"] += " " + qst
242
+ if "content_sm_ltks" in d:
243
+ d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
244
+
245
  return docs
246
 
247