Kevin Hu committed on
Commit 2d7e5db · 1 Parent(s): db89829

Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/chunk_app.py CHANGED

```diff
@@ -68,6 +68,7 @@ def list_chunk():
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
+                "question_kwd": sres.field[id].get("question_kwd", []),
                 "image_id": sres.field[id].get("img_id", ""),
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": json.loads(sres.field[id].get("position_list", "[]")),
@@ -115,7 +116,7 @@ def get():
 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("doc_id", "chunk_id", "content_with_weight",
-                  "important_kwd")
+                  "important_kwd", "question_kwd")
 def set():
     req = request.json
     d = {
@@ -125,6 +126,8 @@ def set():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
+    d["question_kwd"] = req["question_kwd"]
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
     if "available_int" in req:
         d["available_int"] = req["available_int"]
 
@@ -152,7 +155,7 @@ def set():
         d = beAdoc(d, arr[0], arr[1], not any(
             [rag_tokenizer.is_chinese(t) for t in q + a]))
 
-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
@@ -213,6 +216,8 @@ def create():
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+    d["question_kwd"] = req.get("question_kwd", [])
+    d["question_tks"] = rag_tokenizer.tokenize("\n".join(req.get("question_kwd", [])))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
@@ -237,7 +242,7 @@ def create():
     embd_id = DocumentService.get_embd_id(req["doc_id"])
     embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)
 
-    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, req["content_with_weight"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
```
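
With this change the internal `/set` route validates `question_kwd` alongside `important_kwd`, and a non-empty question list replaces the chunk content as the text that gets embedded. A minimal payload sketch; the URL prefix, port, and session cookie are assumptions about a typical deployment, only the field names come from the diff above:

```python
# Hedged sketch: "/v1/chunk/set", the port and the session cookie are assumed;
# the request fields mirror the @validate_request decorator in the diff.
import requests

payload = {
    "doc_id": "<doc_id>",
    "chunk_id": "<chunk_id>",
    "content_with_weight": "The capital of France is Paris.",
    "important_kwd": ["France", "Paris"],
    # New in this PR: when non-empty, these questions are embedded
    # instead of content_with_weight.
    "question_kwd": ["What is the capital of France?"],
}
resp = requests.post("http://localhost:9380/v1/chunk/set",
                     json=payload, cookies={"session": "<your-session>"})
print(resp.json())
```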
api/apps/sdk/doc.py CHANGED

```diff
@@ -844,6 +844,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
+            "question_kwd": sres.field[id].get("question_kwd", []),
             "img_id": sres.field[id].get("img_id", ""),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": sres.field[id].get("position_int", "").split("\t"),
@@ -879,6 +880,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "img_id": "image_id",
         "available_int": "available",
     }
@@ -978,6 +980,11 @@ def add_chunk(tenant_id, dataset_id, document_id):
             return get_error_data_result(
                 "`important_keywords` is required to be a list"
             )
+    if "questions" in req:
+        if type(req["questions"]) != list:
+            return get_error_data_result(
+                "`questions` is required to be a list"
+            )
     md5 = hashlib.md5()
     md5.update((req["content"] + document_id).encode("utf-8"))
 
@@ -992,6 +999,10 @@ def add_chunk(tenant_id, dataset_id, document_id):
     d["important_tks"] = rag_tokenizer.tokenize(
         " ".join(req.get("important_keywords", []))
     )
+    d["question_kwd"] = req.get("questions", [])
+    d["question_tks"] = rag_tokenizer.tokenize(
+        "\n".join(req.get("questions", []))
+    )
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
     d["kb_id"] = dataset_id
@@ -1001,7 +1012,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
     embd_mdl = TenantLLMService.model_instance(
         tenant_id, LLMType.EMBEDDING.value, embd_id
     )
-    v, c = embd_mdl.encode([doc.name, req["content"]])
+    v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
@@ -1013,6 +1024,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "kb_id": "dataset_id",
         "create_timestamp_flt": "create_timestamp",
         "create_time": "create_time",
@@ -1166,8 +1178,13 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
     if "important_keywords" in req:
         if not isinstance(req["important_keywords"], list):
             return get_error_data_result("`important_keywords` should be a list")
-        d["important_kwd"] = req.get("important_keywords")
+        d["important_kwd"] = req.get("important_keywords", [])
         d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
+    if "questions" in req:
+        if not isinstance(req["questions"], list):
+            return get_error_data_result("`questions` should be a list")
+        d["question_kwd"] = req.get("questions")
+        d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"]))
     if "available" in req:
         d["available_int"] = int(req["available"])
     embd_id = DocumentService.get_embd_id(document_id)
@@ -1185,7 +1202,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
             d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
         )
 
-    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
+    v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
     settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
@@ -1353,6 +1370,7 @@ def retrieval_test(tenant_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
+        "question_kwd": "questions",
         "docnm_kwd": "document_keyword",
     }
     rename_chunk = {}
```
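
In the public HTTP API the field is exposed as `questions` and mapped to the internal `question_kwd`/`question_tks` pair. A request sketch against the chunk-creation endpoint visible in the SDK below; the base URL and the bearer-token header are assumptions about a typical setup:

```python
# Hedged sketch: base URL, port and auth header scheme are assumptions;
# the endpoint path and the "questions" field follow the diff and the SDK.
import requests

BASE = "http://localhost:9380/api/v1"            # assumed deployment address
HEADERS = {"Authorization": "Bearer <API_KEY>"}  # assumed auth scheme

resp = requests.post(
    f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
    headers=HEADERS,
    json={
        "content": "RAGFlow stores each chunk together with its embedding.",
        "important_keywords": ["RAGFlow", "embedding"],
        "questions": ["How does RAGFlow store chunk embeddings?"],  # new in this PR
    },
)
print(resp.json())
```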
conf/infinity_mapping.json CHANGED

```diff
@@ -11,6 +11,8 @@
   "name_kwd": {"type": "varchar", "default": ""},
   "important_kwd": {"type": "varchar", "default": ""},
   "important_tks": {"type": "varchar", "default": ""},
+  "question_kwd": {"type": "varchar", "default": ""},
+  "question_tks": {"type": "varchar", "default": ""},
   "content_with_weight": {"type": "varchar", "default": ""},
   "content_ltks": {"type": "varchar", "default": ""},
   "content_sm_ltks": {"type": "varchar", "default": ""},
```
rag/nlp/query.py CHANGED

```diff
@@ -31,6 +31,7 @@ class FulltextQueryer:
             "title_sm_tks^5",
             "important_kwd^30",
             "important_tks^20",
+            "question_tks^20",
             "content_ltks^2",
             "content_sm_ltks",
         ]
```
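
The `^N` suffixes are per-field boosts for full-text matching, so a term hit in `question_tks` now counts 20x as much as a hit in an unboosted field and 10x as much as one in `content_ltks`. A rough, illustrative sketch of what the boost syntax expresses (the real scoring happens inside the document store, not in this code):

```python
# Illustrative only: mimics the intent of the "field^boost" list above.
FIELD_BOOSTS = {
    "title_sm_tks": 5,
    "important_kwd": 30, "important_tks": 20,
    "question_tks": 20,           # new in this PR
    "content_ltks": 2, "content_sm_ltks": 1,
}

def naive_score(hits_per_field: dict[str, int]) -> float:
    """Sum per-field hit counts weighted by their boost."""
    return sum(FIELD_BOOSTS.get(f, 1) * n for f, n in hits_per_field.items())

print(naive_score({"question_tks": 1, "content_ltks": 1}))  # 22
```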
rag/nlp/search.py CHANGED

```diff
@@ -74,7 +74,7 @@ class Dealer:
         offset, limit = pg * ps, (pg + 1) * ps
 
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "doc_id", "position_list", "knowledge_graph_kwd",
+                                 "doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                  "available_int", "content_with_weight", "pagerank_fea"])
         kwds = set([])
 
@@ -251,8 +251,9 @@ class Dealer:
         for i in sres.ids:
             content_ltks = sres.field[i][cfield].split()
             title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
+            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
-            tks = content_ltks + title_tks*2 + important_kwd*5
+            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
             ins_tw.append(tks)
 
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
@@ -322,11 +323,14 @@ class Dealer:
             sim = tsim = vsim = [1]*len(sres.ids)
             idx = list(range(len(sres.ids)))
 
+        def floor_sim(score):
+            return (int(score * 100.)%100)/100.
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
         for i in idx:
-            if sim[i] < similarity_threshold:
+            if floor_sim(sim[i]) < similarity_threshold:
                 break
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
@@ -337,8 +341,6 @@ class Dealer:
             dnm = chunk["docnm_kwd"]
             did = chunk["doc_id"]
             position_list = chunk.get("position_list", "[]")
-            if not position_list:
-                position_list = "[]"
             d = {
                 "chunk_id": id,
                 "content_ltks": chunk["content_ltks"],
```
rag/svr/task_executor.py CHANGED

```diff
@@ -255,13 +255,8 @@ def build_chunks(task, progress_callback):
         progress_callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
         for d in docs:
-            qst = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"])
-            d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
-            qst = rag_tokenizer.tokenize(qst)
-            if "content_ltks" in d:
-                d["content_ltks"] += " " + qst
-            if "content_sm_ltks" in d:
-                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+            d["question_kwd"] = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"]).split("\n")
+            d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
         progress_callback(msg="Question generation completed in {:.2f}s".format(timer() - st))
 
     return docs
@@ -275,9 +270,16 @@ def init_kb(row, vector_size: int):
 def embedding(docs, mdl, parser_config=None, callback=None):
     if parser_config is None:
         parser_config = {}
-    batch_size = 32
-    tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
-        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
+    batch_size = 16
+    tts, cnts = [], []
+    for d in docs:
+        tts.append(rmSpace(d["title_tks"]))
+        c = "\n".join(d.get("question_kwd", []))
+        if not c:
+            c = d["content_with_weight"]
+        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
+        cnts.append(c)
+
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
```
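
The net effect of the `embedding()` change is that auto-generated (or user-provided) questions, when present, stand in for the chunk body as the text that gets embedded, while the raw content is still what gets stored and displayed. A minimal standalone sketch of that selection rule; the helper name is mine, not from the codebase:

```python
import re

def embedding_text(chunk: dict) -> str:
    """Pick the text to embed for one chunk, mirroring the new logic:
    prefer the questions, fall back to the content, then strip simple table tags."""
    text = "\n".join(chunk.get("question_kwd", []))
    if not text:
        text = chunk["content_with_weight"]
    return re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", text)

print(embedding_text({"content_with_weight": "<table><td>42</td></table>",
                      "question_kwd": ["What is the answer?"]}))
# -> "What is the answer?"
print(embedding_text({"content_with_weight": "<td>42</td>", "question_kwd": []}))
# -> " 42 "
```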
sdk/python/ragflow_sdk/modules/chunk.py CHANGED

```diff
@@ -6,6 +6,7 @@ class Chunk(Base):
         self.id = ""
         self.content = ""
         self.important_keywords = []
+        self.questions = []
         self.create_time = ""
         self.create_timestamp = 0.0
         self.dataset_id = None
```
sdk/python/ragflow_sdk/modules/document.py CHANGED

```diff
@@ -61,9 +61,9 @@ class Document(Base):
             return chunks
         raise Exception(res.get("message"))
 
-
-    def add_chunk(self, content: str,important_keywords: list[str] = []):
-        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks', {"content":content,"important_keywords":important_keywords})
+    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = []):
+        res = self.post(f'/datasets/{self.dataset_id}/documents/{self.id}/chunks',
+                        {"content":content,"important_keywords":important_keywords, "questions": questions})
         res = res.json()
         if res.get("code") == 0:
             return Chunk(self.rag,res["data"].get("chunk"))
```
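
With the updated SDK signature, questions can be attached when adding a chunk. A brief usage sketch; the base URL, API key, and the dataset/document lookup calls are assumptions about a typical client setup:

```python
from ragflow_sdk import RAGFlow

# Assumed connection details, for illustration only.
rag = RAGFlow(api_key="<API_KEY>", base_url="http://localhost:9380")
dataset = rag.list_datasets(name="demo")[0]
doc = dataset.list_documents()[0]

chunk = doc.add_chunk(
    content="RAGFlow embeds the questions instead of the content when questions are given.",
    important_keywords=["RAGFlow"],
    questions=["Which text does RAGFlow embed when questions are supplied?"],
)
print(chunk.questions)
```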