liuhua committed
Commit 3a77303 · 1 Parent(s): 9cfd69b

Fix bugs in chunk api (#4293)


### What problem does this PR solve?

Fix bugs in the chunk API (#4149)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: liuhua <[email protected]>

api/apps/chunk_app.py CHANGED
@@ -220,7 +220,7 @@ def create():
     e, doc = DocumentService.get_by_id(req["doc_id"])
     if not e:
         return get_data_error_result(message="Document not found!")
-    d["kb_id"] = [doc.kb_id]
+    d["kb_id"] = doc.kb_id
     d["docnm_kwd"] = doc.name
     d["title_tks"] = rag_tokenizer.tokenize(doc.name)
     d["doc_id"] = doc.id
api/apps/sdk/doc.py CHANGED
@@ -847,59 +847,55 @@ def list_chunks(tenant_id, dataset_id, document_id):
         renamed_doc["run"] = run_mapping.get(str(value))
 
     res = {"total": 0, "chunks": [], "doc": renamed_doc}
-    origin_chunks = []
-    if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
+    if req.get("id"):
+        chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
+        k = []
+        for n in chunk.keys():
+            if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
+                k.append(n)
+        for n in k:
+            del chunk[n]
+        if not chunk:
+            return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
+        res['total'] = 1
+        final_chunk = {
+            "id":chunk.get("id",chunk.get("chunk_id")),
+            "content":chunk["content_with_weight"],
+            "document_id":chunk.get("doc_id",chunk.get("document_id")),
+            "docnm_kwd":chunk["docnm_kwd"],
+            "important_keywords":chunk.get("important_kwd",[]),
+            "questions":chunk.get("question_kwd",[]),
+            "dataset_id":chunk.get("kb_id",chunk.get("dataset_id")),
+            "image_id":chunk["img_id"],
+            "available":bool(chunk.get("available_int",1)),
+            "positions":chunk.get("position_int",[]),
+        }
+        res["chunks"].append(final_chunk)
+        _ = Chunk(**final_chunk)
+
+    elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
         sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
                                            highlight=True)
         res["total"] = sres.total
-        sign = 0
         for id in sres.ids:
             d = {
                 "id": id,
-                "content_with_weight": (
+                "content": (
                     rmSpace(sres.highlight[id])
                     if question and id in sres.highlight
                     else sres.field[id].get("content_with_weight", "")
                 ),
-                "doc_id": sres.field[id]["doc_id"],
+                "document_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
-                "important_kwd": sres.field[id].get("important_kwd", []),
-                "question_kwd": sres.field[id].get("question_kwd", []),
-                "img_id": sres.field[id].get("img_id", ""),
-                "available_int": sres.field[id].get("available_int", 1),
-                "positions": sres.field[id].get("position_int", []),
+                "important_keywords": sres.field[id].get("important_kwd", []),
+                "questions": sres.field[id].get("question_kwd", []),
+                "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
+                "image_id": sres.field[id].get("img_id", ""),
+                "available": bool(sres.field[id].get("available_int", 1)),
+                "positions": sres.field[id].get("position_int",[]),
             }
-            origin_chunks.append(d)
-            if req.get("id"):
-                if req.get("id") == id:
-                    origin_chunks.clear()
-                    origin_chunks.append(d)
-                    sign = 1
-                    break
-        if req.get("id"):
-            if sign == 0:
-                return get_error_data_result(f"Can't find this chunk {req.get('id')}")
-
-        for chunk in origin_chunks:
-            key_mapping = {
-                "id": "id",
-                "content_with_weight": "content",
-                "doc_id": "document_id",
-                "important_kwd": "important_keywords",
-                "question_kwd": "questions",
-                "img_id": "image_id",
-                "available_int": "available",
-            }
-            renamed_chunk = {}
-            for key, value in chunk.items():
-                new_key = key_mapping.get(key, key)
-                renamed_chunk[new_key] = value
-            if renamed_chunk["available"] == 0:
-                renamed_chunk["available"] = False
-            if renamed_chunk["available"] == 1:
-                renamed_chunk["available"] = True
-            res["chunks"].append(renamed_chunk)
-            _ = Chunk(**renamed_chunk)  # validate the chunk
+            res["chunks"].append(d)
+            _ = Chunk(**d)  # validate the chunk
     return get_result(data=res)
 
 
@@ -1377,6 +1373,7 @@ def retrieval_test(tenant_id):
             "important_kwd": "important_keywords",
             "question_kwd": "questions",
             "docnm_kwd": "document_keyword",
+            "kb_id":"dataset_id"
         }
         rename_chunk = {}
         for key, value in chunk.items():
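With this change, a request that carries an `id` fetches the chunk directly from the doc store, strips internal vector/token fields (`_vec`, `_sm_`, `_tks`, `_ltks`), and returns a single chunk under the public field names. A hedged client sketch is shown below; the base URL, API key, and exact route are assumptions modelled on RAGFlow's HTTP API reference, not values taken from this PR.

```python
# Hedged client sketch: fetching one chunk by id through the list-chunks endpoint
# served by the new `if req.get("id")` branch. Placeholders must be filled in.
import requests

BASE_URL = "http://localhost:9380"   # assumed local RAGFlow instance
API_KEY = "<YOUR_API_KEY>"           # placeholder
dataset_id = "<DATASET_ID>"
document_id = "<DOCUMENT_ID>"
chunk_id = "<CHUNK_ID>"

resp = requests.get(
    f"{BASE_URL}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks",
    headers={"Authorization": f"Bearer {API_KEY}"},
    params={"id": chunk_id},         # triggers the direct doc-store lookup
    timeout=30,
)
data = resp.json()
# On success, data["data"]["chunks"] should hold exactly one chunk with the renamed
# fields (content, document_id, important_keywords, questions, dataset_id, ...).
print(data)
```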
docs/references/http_api_reference.md CHANGED
@@ -927,7 +927,8 @@ curl --request POST \
   The text content of the chunk.
 - `"important_keywords`(*Body parameter*), `list[string]`
   The key terms or phrases to tag with the chunk.
-
+- `"questions"`(*Body parameter*), `list[string]`
+  If there is a given question, the embedded chunks will be based on them
 #### Response
 
 Success:
@@ -937,13 +938,14 @@ Success:
     "code": 0,
     "data": {
        "chunk": {
-            "content": "ragflow content",
-            "create_time": "2024-10-16 08:05:04",
-            "create_timestamp": 1729065904.581025,
-            "dataset_id": "c7ee74067a2c11efb21c0242ac120006",
-            "document_id": "5c5999ec7be811ef9cab0242ac120005",
-            "id": "d78435d142bd5cf6704da62c778795c5",
-            "important_keywords": []
+            "content": "who are you",
+            "create_time": "2024-12-30 16:59:55",
+            "create_timestamp": 1735549195.969164,
+            "dataset_id": "72f36e1ebdf411efb7250242ac120006",
+            "document_id": "61d68474be0111ef98dd0242ac120006",
+            "id": "12ccdc56e59837e5",
+            "important_keywords": [],
+            "questions": []
        }
     }
 }
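The documentation diff above adds the `"questions"` body parameter to the add-chunk section. A hedged request sketch follows; the host, API key, and ids are placeholders, and the route follows the "Add chunk" section of the HTTP API reference this diff edits.

```python
# Hedged usage sketch for adding a chunk with the new "questions" body parameter.
import requests

BASE_URL = "http://localhost:9380"   # assumed local RAGFlow instance
API_KEY = "<YOUR_API_KEY>"           # placeholder
dataset_id = "<DATASET_ID>"
document_id = "<DOCUMENT_ID>"

resp = requests.post(
    f"{BASE_URL}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "content": "who are you",            # text content of the chunk
        "important_keywords": ["identity"],  # optional tags (hypothetical values)
        "questions": ["Who are you?"],       # new: questions the embedding is based on
    },
    timeout=30,
)
print(resp.json())  # expected to echo the created chunk, including "questions"
```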