Kevin Hu committed on
Commit
c60fd19
·
1 Parent(s): 3b7343c

remove duplicated key in mind map (#1809)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (2) hide show
  1. graphrag/index.py +10 -5
  2. rag/nlp/__init__.py +1 -0
graphrag/index.py CHANGED
@@ -29,14 +29,15 @@ from rag.nlp import rag_tokenizer
29
  from rag.utils import num_tokens_from_string
30
 
31
 
32
- def be_children(obj: dict):
33
  arr = []
34
  for k,v in obj.items():
35
  k = re.sub(r"\*+", "", k)
36
- if not k :continue
 
37
  arr.append({
38
  "id": k,
39
- "children": be_children(v) if isinstance(v, dict) else []
40
  })
41
  return arr
42
 
@@ -142,8 +143,12 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
142
  mg = mindmap(_chunks).output
143
  if not len(mg.keys()): return chunks
144
 
145
- if len(mg.keys()) > 1: md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
146
- else: md_map = {"id": re.sub(r"\*+", "", list(mg.keys())[0]), "children": be_children(list(mg.items())[1])}
 
 
 
 
147
  print(json.dumps(md_map, ensure_ascii=False, indent=2))
148
  chunks.append(
149
  {
 
29
  from rag.utils import num_tokens_from_string
30
 
31
 
32
+ def be_children(obj: dict, keyset:set):
33
  arr = []
34
  for k,v in obj.items():
35
  k = re.sub(r"\*+", "", k)
36
+ if not k or k in keyset:continue
37
+ keyset.add(k)
38
  arr.append({
39
  "id": k,
40
+ "children": be_children(v, keyset) if isinstance(v, dict) else []
41
  })
42
  return arr
43
 
 
143
  mg = mindmap(_chunks).output
144
  if not len(mg.keys()): return chunks
145
 
146
+ if len(mg.keys()) > 1:
147
+ keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
148
+ md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
149
+ else:
150
+ k = re.sub(r"\*+", "", list(mg.keys())[0])
151
+ md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))}
152
  print(json.dumps(md_map, ensure_ascii=False, indent=2))
153
  chunks.append(
154
  {
rag/nlp/__init__.py CHANGED
@@ -483,6 +483,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
483
  def add_chunk(t, pos):
484
  nonlocal cks, tk_nums, delimiter
485
  tnum = num_tokens_from_string(t)
 
486
  if tnum < 8:
487
  pos = ""
488
  # Ensure that the length of the merged chunk does not exceed chunk_token_num
 
483
  def add_chunk(t, pos):
484
  nonlocal cks, tk_nums, delimiter
485
  tnum = num_tokens_from_string(t)
486
+ if not pos: pos = ""
487
  if tnum < 8:
488
  pos = ""
489
  # Ensure that the length of the merged chunk does not exceed chunk_token_num