H committed
Commit 73c78d3 · Parent: 349437c

Fix graphrag callback (#1806)


### What problem does this PR solve?

#1800

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (2)
  1. graphrag/index.py +2 -2
  2. rag/app/knowledge_graph.py +2 -2
graphrag/index.py CHANGED
@@ -45,7 +45,7 @@ def graph_merge(g1, g2):
     g = g2.copy()
     for n, attr in g1.nodes(data=True):
         if n not in g2.nodes():
-            g2.add_node(n, **attr)
+            g.add_node(n, **attr)
             continue
 
         g.nodes[n]["weight"] += 1
@@ -75,7 +75,7 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
     cnt = 0
     threads = []
     exe = ThreadPoolExecutor(max_workers=12)
-    for i in range(len(chunks[:512])):
+    for i in range(len(chunks)):
         tkn_cnt = num_tokens_from_string(chunks[i])
         if cnt+tkn_cnt >= left_token_count and texts:
             threads.append(exe.submit(ext, texts, {"entity_types": entity_types}))
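
The first hunk fixes a merge bug: `g` is a copy of `g2`, so adding the missing node to `g2` after the copy never reaches the merged graph, and nodes that exist only in `g1` are silently dropped. Below is a minimal sketch of that behaviour, assuming networkx graphs (which the `g.nodes(data=True)` / `g.copy()` API suggests); the sample nodes are made up for illustration.

```python
# Minimal sketch, assuming networkx graphs; not part of the PR itself.
import networkx as nx

g1 = nx.Graph()
g1.add_node("alice", weight=1)

g2 = nx.Graph()
g2.add_node("bob", weight=1)

g = g2.copy()
for n, attr in g1.nodes(data=True):
    if n not in g2.nodes():
        # Before this fix the node was added to g2, which had already been
        # copied, so it never appeared in the merged result g.
        g.add_node(n, **attr)
        continue
    g.nodes[n]["weight"] += 1

print(sorted(g.nodes()))  # ['alice', 'bob']: nodes unique to g1 are preserved
```

The second hunk removes the `[:512]` cap so that all chunks, not only the first 512, are scheduled for entity extraction.
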
rag/app/knowledge_graph.py CHANGED
@@ -13,7 +13,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
 
     parser_config["layout_recognize"] = False
-    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, parser_config=parser_config)
+    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback, parser_config=parser_config)
     chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
                                          parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
                                          )
@@ -27,4 +27,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     chunks.extend(tokenize_chunks(sections, doc, eng))
 
-    return chunks
+    return chunks
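
This change forwards the progress callback into naive.chunk, which is the callback the PR title refers to. A rough sketch of why the forwarding matters, using a hypothetical `callback(progress, message)` signature and made-up function names rather than ragflow's actual API:

```python
# Hypothetical sketch: names and the callback signature are illustrative only.
def naive_chunk(filename, callback=None):
    # If no callback is forwarded, progress reporting is silently skipped.
    if callback:
        callback(0.5, f"parsing {filename}")
    return ["section 1", "section 2"]

def chunk(filename, callback=None):
    # The bug pattern this PR fixes: calling naive_chunk(filename) without
    # callback=callback drops the caller's progress reporting entirely.
    sections = naive_chunk(filename, callback=callback)
    return sections

chunk("demo.txt", callback=lambda prog, msg: print(f"{prog:.0%} {msg}"))
```
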