Kevin Hu committed · Commit b6ce919 · Parent: c530565

refine mindmap (#1817)

### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring

Files changed:

- graphrag/index.py: +8 -1
- graphrag/mind_map_extractor.py: +20 -4
- graphrag/mind_map_prompt.py: +11 -19
graphrag/index.py CHANGED

```diff
@@ -21,6 +21,7 @@ from typing import List
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
+from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
@@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
 
 
 def be_children(obj: dict, keyset:set):
+    if isinstance(obj, str):
+        obj = [obj]
+    if isinstance(obj, list):
+        for i in obj: keyset.add(i)
+        return [{"id": i, "children":[]} for i in obj]
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
 
 
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
-
+    _, tenant = TenantService.get_by_id(tenant_id)
+    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
     left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
```
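For orientation, here is a minimal runnable sketch of `be_children` after this change. The `str`/`list` branch is copied from the diff; the dict-handling tail (from `arr = []` down, including the recursion) is an assumption reconstructed from the visible context lines, not necessarily the verbatim RAGFlow implementation:

```python
import re


def be_children(obj, keyset: set):
    # New in this PR: string and list leaves become terminal mind-map
    # nodes instead of falling through to .items() and raising.
    if isinstance(obj, str):
        obj = [obj]
    if isinstance(obj, list):
        for i in obj:
            keyset.add(i)
        return [{"id": i, "children": []} for i in obj]
    # Assumed dict handling: strip markdown bold markers (**) from keys,
    # record each key, and recurse into the value.
    arr = []
    for k, v in obj.items():
        k = re.sub(r"\*+", "", k)
        keyset.add(k)
        arr.append({"id": k, "children": be_children(v, keyset)})
    return arr


keys = set()
print(be_children({"**Intro**": ["background", "scope"]}, keys))
# [{'id': 'Intro', 'children': [{'id': 'background', 'children': []},
#                               {'id': 'scope', 'children': []}]}]
```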
graphrag/mind_map_extractor.py CHANGED

```diff
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import collections
+import logging
+import re
 import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
@@ -65,7 +67,7 @@ class MindMapExtractor:
         try:
             exe = ThreadPoolExecutor(max_workers=12)
             threads = []
-            token_count = self._llm.max_length * 0.
+            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
             texts = []
             res = []
             cnt = 0
@@ -122,6 +124,19 @@ class MindMapExtractor:
                     continue
             return data
 
+    def _todict(self, layer:collections.OrderedDict):
+        to_ret = layer
+        if isinstance(layer, collections.OrderedDict):
+            to_ret = dict(layer)
+
+        try:
+            for key, value in to_ret.items():
+                to_ret[key] = self._todict(value)
+        except AttributeError:
+            pass
+
+        return self._list_to_kv(to_ret)
+
     def _process_document(
         self, text: str, prompt_variables: dict[str, str]
     ) -> str:
@@ -132,6 +147,7 @@ class MindMapExtractor:
         text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [], gen_conf)
+        response = re.sub(r"```[^\n]*", "", response)
         print(response)
-        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
-        return
+        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
+        return self._todict(markdown_to_json.dictify(response))
```
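The response-handling changes are easiest to see end to end. A hedged, self-contained sketch: LLMs often wrap their markdown in code fences, so the added `re.sub` deletes fence lines before `markdown_to_json.dictify` parses the outline, and `_todict` then converts the (historically `collections.OrderedDict`-based) parse tree into plain dicts. `_list_to_kv`, which the real `_todict` calls last, is an internal helper the diff does not show, so this sketch returns the converted layer directly:

```python
import collections
import re

import markdown_to_json  # pip install markdown-to-json


def todict(layer):
    # Sketch of _todict: recursively turn OrderedDict layers into plain dicts.
    to_ret = layer
    if isinstance(layer, collections.OrderedDict):
        to_ret = dict(layer)
    try:
        for key, value in to_ret.items():
            to_ret[key] = todict(value)
    except AttributeError:
        pass  # leaves (str, list, ...) have no .items()
    # The real method ends with self._list_to_kv(to_ret), a helper the diff
    # doesn't show; returning the layer keeps this sketch self-contained.
    return to_ret


response = "```markdown\n## Title\n- Section A\n- Section B\n```"
response = re.sub(r"```[^\n]*", "", response)  # drop code-fence lines
print(todict(markdown_to_json.dictify(response)))
# e.g. {'Title': ['Section A', 'Section B']}
```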
graphrag/mind_map_prompt.py CHANGED

Note: several removed lines in this hunk did not survive extraction and are shown as bare `-` markers.

```diff
@@ -14,28 +14,20 @@
 # limitations under the License.
 #
 MIND_MAP_EXTRACTION_PROMPT = """
-
+- Role: You're a talent text processor to summarize a piece of text into a mind map.
 
-
-
-
-
+- Step of task:
+  1. Generate a title for user's 'TEXT'。
+  2. Classify the 'TEXT' into sections of a mind map.
+  3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
+  4. Add a shot content summary of the bottom level section.
+
+- Output requirement:
+  - Always try to maximize the number of sub-sections.
+  - In language of 'Text'
+  - MUST IN FORMAT OF MARKDOWN
 
-- Output requirement:
-  - In language of
-  - MUST IN FORMAT OF MARKDOWN
-
-Output:
-## <Title>
-<Section Name>
-<Section Name>
-<Subsection Name>
-<Subsection Name>
-<Section Name>
-<Subsection Name>
-
 -TEXT-
 {input_text}
 
-Output:
 """
```
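Finally, a hedged sketch of how this template is consumed: in `_process_document` the `{input_text}` slot is filled by graphrag's `perform_variable_replacements` before the chat call; a plain `str.replace` stands in for that helper here, and the sample text is invented for illustration:

```python
from graphrag.mind_map_prompt import MIND_MAP_EXTRACTION_PROMPT

# Hypothetical input; any document chunk works.
input_text = "Transformers replace recurrence with self-attention ..."

# Stand-in for graphrag's perform_variable_replacements helper.
prompt = MIND_MAP_EXTRACTION_PROMPT.replace("{input_text}", input_text)

# `prompt` now asks the model for a markdown outline (title, sections,
# sub-sections) of input_text; _process_document sends it via
# self._llm.chat(prompt, [], {"temperature": 0.5}) and parses the markdown
# reply with markdown_to_json.dictify, as shown in the extractor diff above.
```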