Kevin Hu
committed on
Commit
·
c60fd19
1
Parent(s):
3b7343c
remove duplicated key in mind map (#1809)
Browse files

### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- graphrag/index.py +10 -5
- rag/nlp/__init__.py +1 -0
graphrag/index.py
CHANGED
|
@@ -29,14 +29,15 @@ from rag.nlp import rag_tokenizer
|
|
| 29 |
from rag.utils import num_tokens_from_string
|
| 30 |
|
| 31 |
|
| 32 |
-
def be_children(obj: dict, keyset=None):
    """Convert a nested mind-map dict into a list of child-node dicts.

    Each key is stripped of markdown emphasis markers (``*``) and becomes a
    node ``{"id": key, "children": [...]}``; dict values are recursed into,
    any non-dict value yields an empty ``children`` list.

    :param obj: nested mapping of node label -> sub-tree (dict) or leaf value
    :param keyset: set of labels already emitted; labels found here are
        skipped, so the same key cannot appear twice anywhere in the tree.
        Defaults to a fresh set, keeping the original one-argument call
        form working unchanged.
    :return: list of node dicts suitable for a mind-map "children" field
    """
    if keyset is None:
        keyset = set()
    arr = []
    for k, v in obj.items():
        # Strip markdown bold/italic markers so "**Topic**" and "Topic" collide.
        k = re.sub(r"\*+", "", k)
        # Skip labels that are empty after stripping, and duplicates
        # (fixes the same key being rendered multiple times in the map).
        if not k or k in keyset:
            continue
        keyset.add(k)
        arr.append({
            "id": k,
            "children": be_children(v, keyset) if isinstance(v, dict) else []
        })
    return arr
|
| 42 |
|
|
@@ -142,8 +143,12 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
|
|
| 142 |
mg = mindmap(_chunks).output
|
| 143 |
if not len(mg.keys()): return chunks
|
| 144 |
|
| 145 |
-
if len(mg.keys()) > 1:
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
print(json.dumps(md_map, ensure_ascii=False, indent=2))
|
| 148 |
chunks.append(
|
| 149 |
{
|
|
|
|
| 29 |
from rag.utils import num_tokens_from_string
|
| 30 |
|
| 31 |
|
| 32 |
+
def be_children(obj: dict, keyset: set):
    """Recursively turn a nested dict into a list of mind-map child nodes.

    Markdown emphasis markers (``*``) are stripped from every key; keys
    that become empty, or that were already seen (tracked in ``keyset``),
    are dropped so no label appears more than once in the whole tree.
    Dict values are recursed into; leaf values get an empty children list.
    """
    children = []
    for label, subtree in obj.items():
        label = re.sub(r"\*+", "", label)
        if not label or label in keyset:
            continue
        keyset.add(label)
        node = {"id": label}
        if isinstance(subtree, dict):
            node["children"] = be_children(subtree, keyset)
        else:
            node["children"] = []
        children.append(node)
    return children
|
| 43 |
|
|
|
|
| 143 |
mg = mindmap(_chunks).output
|
| 144 |
if not len(mg.keys()): return chunks
|
| 145 |
|
| 146 |
+
if len(mg.keys()) > 1:
|
| 147 |
+
keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
|
| 148 |
+
md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
|
| 149 |
+
else:
|
| 150 |
+
k = re.sub(r"\*+", "", list(mg.keys())[0])
|
| 151 |
+
md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))}
|
| 152 |
print(json.dumps(md_map, ensure_ascii=False, indent=2))
|
| 153 |
chunks.append(
|
| 154 |
{
|
rag/nlp/__init__.py
CHANGED
|
@@ -483,6 +483,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
|
| 483 |
def add_chunk(t, pos):
|
| 484 |
nonlocal cks, tk_nums, delimiter
|
| 485 |
tnum = num_tokens_from_string(t)
|
|
|
|
| 486 |
if tnum < 8:
|
| 487 |
pos = ""
|
| 488 |
# Ensure that the length of the merged chunk does not exceed chunk_token_num
|
|
|
|
| 483 |
def add_chunk(t, pos):
|
| 484 |
nonlocal cks, tk_nums, delimiter
|
| 485 |
tnum = num_tokens_from_string(t)
|
| 486 |
+
if not pos: pos = ""
|
| 487 |
if tnum < 8:
|
| 488 |
pos = ""
|
| 489 |
# Ensure that the length of the merged chunk does not exceed chunk_token_num
|