Kevin Hu committed · Commit b6ce919 · Parent: c530565

refine mindmap (#1817)

### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring

Files changed:

- graphrag/index.py: +8 -1
- graphrag/mind_map_extractor.py: +20 -4
- graphrag/mind_map_prompt.py: +11 -19
graphrag/index.py CHANGED

```diff
@@ -21,6 +21,7 @@ from typing import List
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
+from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
@@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
 
 
 def be_children(obj: dict, keyset:set):
+    if isinstance(obj, str):
+        obj = [obj]
+    if isinstance(obj, list):
+        for i in obj: keyset.add(i)
+        return [{"id": i, "children":[]} for i in obj]
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
 
 
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
-
+    _, tenant = TenantService.get_by_id(tenant_id)
+    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
     left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
```
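For orientation, here is a minimal runnable sketch of `be_children` after this change. The `str`/`list` branch is copied from the diff; the dict-handling tail (from `arr = []` down, including the recursion) is an assumption reconstructed from the visible context lines, not necessarily the verbatim RAGFlow implementation:

```python
import re


def be_children(obj, keyset: set):
    # New in this PR: string and list leaves become terminal mind-map
    # nodes instead of falling through to .items() and raising.
    if isinstance(obj, str):
        obj = [obj]
    if isinstance(obj, list):
        for i in obj:
            keyset.add(i)
        return [{"id": i, "children": []} for i in obj]
    # Assumed dict handling: strip markdown bold markers (**) from keys,
    # record each key, and recurse into the value.
    arr = []
    for k, v in obj.items():
        k = re.sub(r"\*+", "", k)
        keyset.add(k)
        arr.append({"id": k, "children": be_children(v, keyset)})
    return arr


keys = set()
print(be_children({"**Intro**": ["background", "scope"]}, keys))
# [{'id': 'Intro', 'children': [{'id': 'background', 'children': []},
#                               {'id': 'scope', 'children': []}]}]
```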
graphrag/mind_map_extractor.py CHANGED

```diff
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import collections
+import logging
+import re
 import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
@@ -65,7 +67,7 @@ class MindMapExtractor:
         try:
             exe = ThreadPoolExecutor(max_workers=12)
             threads = []
-            token_count = self._llm.max_length * 0.
+            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
             texts = []
             res = []
             cnt = 0
@@ -122,6 +124,19 @@ class MindMapExtractor:
                     continue
             return data
 
+    def _todict(self, layer:collections.OrderedDict):
+        to_ret = layer
+        if isinstance(layer, collections.OrderedDict):
+            to_ret = dict(layer)
+
+        try:
+            for key, value in to_ret.items():
+                to_ret[key] = self._todict(value)
+        except AttributeError:
+            pass
+
+        return self._list_to_kv(to_ret)
+
     def _process_document(
         self, text: str, prompt_variables: dict[str, str]
     ) -> str:
@@ -132,6 +147,7 @@ class MindMapExtractor:
         text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [], gen_conf)
+        response = re.sub(r"```[^\n]*", "", response)
         print(response)
-        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
-        return
+        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
+        return self._todict(markdown_to_json.dictify(response))
```
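The response-handling changes are easiest to see end to end. A hedged, self-contained sketch: LLMs often wrap their markdown in code fences, so the added `re.sub` deletes fence lines before `markdown_to_json.dictify` parses the outline, and `_todict` then converts the (historically `collections.OrderedDict`-based) parse tree into plain dicts. `_list_to_kv`, which the real `_todict` calls last, is an internal helper the diff does not show, so this sketch returns the converted layer directly:

```python
import collections
import re

import markdown_to_json  # pip install markdown-to-json


def todict(layer):
    # Sketch of _todict: recursively turn OrderedDict layers into plain dicts.
    to_ret = layer
    if isinstance(layer, collections.OrderedDict):
        to_ret = dict(layer)
    try:
        for key, value in to_ret.items():
            to_ret[key] = todict(value)
    except AttributeError:
        pass  # leaves (str, list, ...) have no .items()
    # The real method ends with self._list_to_kv(to_ret), a helper the diff
    # doesn't show; returning the layer keeps this sketch self-contained.
    return to_ret


response = "```markdown\n## Title\n- Section A\n- Section B\n```"
response = re.sub(r"```[^\n]*", "", response)  # drop code-fence lines
print(todict(markdown_to_json.dictify(response)))
# e.g. {'Title': ['Section A', 'Section B']}
```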
graphrag/mind_map_prompt.py CHANGED

Note: several removed lines in this hunk did not survive extraction and are shown as bare `-` markers.

```diff
@@ -14,28 +14,20 @@
 # limitations under the License.
 #
 MIND_MAP_EXTRACTION_PROMPT = """
-
+- Role: You're a talent text processor to summarize a piece of text into a mind map.
 
-
-
-
-
+- Step of task:
+  1. Generate a title for user's 'TEXT'。
+  2. Classify the 'TEXT' into sections of a mind map.
+  3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
+  4. Add a shot content summary of the bottom level section.
+
+- Output requirement:
+  - Always try to maximize the number of sub-sections.
+  - In language of 'Text'
+  - MUST IN FORMAT OF MARKDOWN
 
-- Output requirement:
-  - In language of
-  - MUST IN FORMAT OF MARKDOWN
-
-Output:
-## <Title>
-<Section Name>
-<Section Name>
-<Subsection Name>
-<Subsection Name>
-<Section Name>
-<Subsection Name>
-
 -TEXT-
 {input_text}
 
-Output:
 """
```
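Finally, a hedged sketch of how this template is consumed: in `_process_document` the `{input_text}` slot is filled by graphrag's `perform_variable_replacements` before the chat call; a plain `str.replace` stands in for that helper here, and the sample text is invented for illustration:

```python
from graphrag.mind_map_prompt import MIND_MAP_EXTRACTION_PROMPT

# Hypothetical input; any document chunk works.
input_text = "Transformers replace recurrence with self-attention ..."

# Stand-in for graphrag's perform_variable_replacements helper.
prompt = MIND_MAP_EXTRACTION_PROMPT.replace("{input_text}", input_text)

# `prompt` now asks the model for a markdown outline (title, sections,
# sub-sections) of input_text; _process_document sends it via
# self._llm.chat(prompt, [], {"temperature": 0.5}) and parses the markdown
# reply with markdown_to_json.dictify, as shown in the extractor diff above.
```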