Kevin Hu committed
Commit 3b7343c · 1 Parent(s): 73c78d3

refine mindmap prompt (#1808)

### What problem does this PR solve?

Keeps a document's own run status instead of resetting it to RUNNING while task progress is aggregated, refines the mind map extraction prompt by dropping the "maximize the number of sub-sections" instruction, removes the deprecated `qwen-max-1201` model entry, and tidies the `naive.chunk` call in the knowledge graph parser.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/db/services/document_service.py CHANGED
```diff
@@ -142,7 +142,7 @@ class DocumentService(CommonService):
     @classmethod
     @DB.connection_context()
     def get_unfinished_docs(cls):
-        fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg]
+        fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run]
         docs = cls.model.select(*fields) \
             .where(
                 cls.model.status == StatusEnum.VALID.value,
@@ -311,7 +311,7 @@ class DocumentService(CommonService):
         prg = 0
         finished = True
         bad = 0
-        status = TaskStatus.RUNNING.value
+        status = d["run"]  # TaskStatus.RUNNING.value
         for t in tsks:
             if 0 <= t.progress < 1:
                 finished = False
```
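The second hunk is the behavioral fix: progress aggregation previously overwrote the document status with `TaskStatus.RUNNING.value`, so a cancelled document could be flipped back to running. Seeding `status` from the stored `run` column (now also fetched by `get_unfinished_docs`) preserves the user-set state. A minimal sketch of that logic, with the `TaskStatus` values and the shapes of `d` and `tsks` assumed for illustration:

```python
from enum import Enum

class TaskStatus(Enum):
    # Values assumed to mirror RAGFlow's enum; only the idea matters here.
    RUNNING = "1"
    CANCEL = "2"
    DONE = "3"
    FAIL = "4"

def aggregate_status(d: dict, tsks: list[dict]) -> str:
    """Start from the document's persisted run state instead of forcing
    RUNNING, so a user-cancelled document is not flipped back to running
    while its remaining task rows are aggregated."""
    status = d["run"]  # e.g. stays CANCEL if the user already cancelled
    finished = all(not (0 <= t["progress"] < 1) for t in tsks)
    bad = sum(1 for t in tsks if t["progress"] == -1)
    if finished:
        status = TaskStatus.FAIL.value if bad else TaskStatus.DONE.value
    return status
```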
conf/llm_factories.json CHANGED
```diff
@@ -92,12 +92,6 @@
             "max_tokens": 32768,
             "model_type": "chat"
         },
-        {
-            "llm_name": "qwen-max-1201",
-            "tags": "LLM,CHAT,6K",
-            "max_tokens": 5899,
-            "model_type": "chat"
-        },
         {
             "llm_name": "text-embedding-v2",
             "tags": "TEXT EMBEDDING,2K",
```
graphrag/mind_map_prompt.py CHANGED
```diff
@@ -22,7 +22,6 @@ MIND_MAP_EXTRACTION_PROMPT = """
 3. If the subject matter is really complex, split them into sub-sections.
 
 - Output requirement:
-  - Always try to maximize the number of sub-sections.
   - In language of
   - MUST IN FORMAT OF MARKDOWN
 
```
rag/app/knowledge_graph.py CHANGED
```diff
@@ -13,7 +13,8 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
 
     parser_config["layout_recognize"] = False
-    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config)
+    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
+                           parser_config=parser_config, callback=callback)
     chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
                                          parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
                                          )
@@ -27,4 +28,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     chunks.extend(tokenize_chunks(sections, doc, eng))
 
-    return chunks
+    return chunks
```
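The `naive.chunk` edit is cosmetic: the keyword arguments are rewrapped and the stray space before the comma is gone, and since they are keywords, their order never changes call semantics in Python; `return chunks` is only touched for whitespace. A hypothetical caller of this parser's `chunk` entry point might look like the sketch below, where the file name, tenant id, and the callback's exact signature are invented for illustration:

```python
from rag.app import knowledge_graph

def progress(prog=None, msg=""):
    # RAGFlow-style parsers report progress through a callback;
    # this signature is an assumption for the sketch.
    print(f"progress={prog} {msg}")

with open("handbook.pdf", "rb") as f:  # hypothetical input document
    chunks = knowledge_graph.chunk(
        "handbook.pdf",
        f.read(),
        tenant_id="tenant-123",        # assumed tenant id
        lang="English",
        callback=progress,
        parser_config={"entity_types": ["organization", "person", "event"]},
    )
print(f"{len(chunks)} chunks produced")
```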