Kevin Hu
committed
Commit · 3b7343c
1 Parent(s): 73c78d3

refine mindmap prompt (#1808)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
api/db/services/document_service.py CHANGED

@@ -142,7 +142,7 @@ class DocumentService(CommonService):
     @classmethod
     @DB.connection_context()
     def get_unfinished_docs(cls):
-        fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg]
+        fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run]
         docs = cls.model.select(*fields) \
             .where(
                 cls.model.status == StatusEnum.VALID.value,
@@ -311,7 +311,7 @@ class DocumentService(CommonService):
             prg = 0
             finished = True
             bad = 0
-            status = TaskStatus.RUNNING.value
+            status = d["run"]  # TaskStatus.RUNNING.value
             for t in tsks:
                 if 0 <= t.progress < 1:
                     finished = False
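The `status` change above makes the aggregated document status start from the value already stored in the document's `run` column (now also fetched by `get_unfinished_docs`) instead of being hard-coded to `TaskStatus.RUNNING.value`. Below is a minimal sketch of that aggregation idea, assuming a `TaskStatus` enum and dict-shaped rows; it is an illustration, not ragflow's actual `DocumentService.update_progress` code.

```python
from enum import Enum


class TaskStatus(Enum):
    # Assumed values for illustration; the real enum lives in ragflow's api/db.
    RUNNING = "1"
    CANCEL = "2"
    DONE = "3"
    FAIL = "4"


def aggregate_progress(doc: dict, tasks: list) -> tuple:
    """Roll per-task progress up into one (progress, status) pair for a doc."""
    prg = 0.0
    finished = True
    bad = 0
    # Start from the status already stored on the document (its "run" column)
    # rather than assuming every unfinished document is RUNNING, so e.g. a
    # cancelled document keeps its cancelled status.
    status = doc["run"]
    for t in tasks:
        if 0 <= t["progress"] < 1:
            finished = False
        bad += 1 if t["progress"] == -1 else 0
        prg += max(t["progress"], 0)
    prg /= max(len(tasks), 1)
    if finished:
        status = TaskStatus.FAIL.value if bad else TaskStatus.DONE.value
    return prg, status


if __name__ == "__main__":
    doc = {"run": TaskStatus.RUNNING.value}
    tasks = [{"progress": 1.0}, {"progress": 0.5}]
    print(aggregate_progress(doc, tasks))  # (0.75, '1') -> still running
```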
conf/llm_factories.json CHANGED

@@ -92,12 +92,6 @@
             "max_tokens": 32768,
             "model_type": "chat"
         },
-        {
-            "llm_name": "qwen-max-1201",
-            "tags": "LLM,CHAT,6K",
-            "max_tokens": 5899,
-            "model_type": "chat"
-        },
         {
             "llm_name": "text-embedding-v2",
             "tags": "TEXT EMBEDDING,2K",
graphrag/mind_map_prompt.py CHANGED

@@ -22,7 +22,6 @@ MIND_MAP_EXTRACTION_PROMPT = """
 3. If the subject matter is really complex, split them into sub-sections.
 
 - Output requirement:
-  - Always try to maximize the number of sub-sections.
   - In language of 
   - MUST IN FORMAT OF MARKDOWN
 
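The remaining output requirements keep the model's reply in markdown. Downstream, an outline like that has to be turned into a tree before it can be rendered as a mind map; a minimal sketch of such a parser follows (illustration only, assuming '#'-style headings; this is not ragflow's actual mind-map post-processing).

```python
import re


def markdown_outline_to_tree(md: str) -> dict:
    """Parse '#'-style markdown headings into a nested {title: children} dict.

    Illustration only: ragflow's real mind-map post-processing may differ.
    """
    root: dict = {}
    stack = [(0, root)]  # (heading level, children dict)
    for line in md.splitlines():
        m = re.match(r"^(#+)\s+(.*)", line.strip())
        if not m:
            continue  # ignore non-heading lines (bullets, prose)
        level, title = len(m.group(1)), m.group(2)
        # Pop back to the nearest ancestor with a smaller heading level.
        while stack[-1][0] >= level:
            stack.pop()
        parent = stack[-1][1]
        parent[title] = {}
        stack.append((level, parent[title]))
    return root


if __name__ == "__main__":
    demo = "# Topic\n## Section A\n### Point 1\n## Section B\n"
    print(markdown_outline_to_tree(demo))
    # {'Topic': {'Section A': {'Point 1': {}}, 'Section B': {}}}
```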
rag/app/knowledge_graph.py CHANGED

@@ -13,7 +13,8 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
 
     parser_config["layout_recognize"] = False
-    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
+    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
+                           parser_config=parser_config, callback=callback)
     chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
         parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
     )
@@ -27,4 +28,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     chunks.extend(tokenize_chunks(sections, doc, eng))
 
-    return chunks
+    return chunks
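The net effect of this fix is that `parser_config` and `callback` now reach `naive.chunk`, so sectioning honours the configured parser settings and progress gets reported back to the caller. A minimal sketch of that call flow with stand-in functions is below; the real `naive.chunk` and `build_knowlege_graph_chunks` signatures are only inferred from the diff above.

```python
from typing import Callable, List


def naive_chunk(filename: str, binary: bytes, *, from_page: int, to_page: int,
                section_only: bool, parser_config: dict,
                callback: Callable[[float, str], None]) -> List[str]:
    """Stand-in for rag.app.naive.chunk: split a document into sections."""
    callback(0.1, "Sections extracted.")
    return ["Section one ...", "Section two ..."]


def build_kg_chunks(tenant_id: str, sections: List[str],
                    callback: Callable[[float, str], None],
                    entity_types: List[str]) -> List[dict]:
    """Stand-in for build_knowlege_graph_chunks: sections -> KG chunks."""
    callback(0.8, "Knowledge graph built.")
    return [{"content": s, "entity_types": entity_types} for s in sections]


def chunk(filename: str, binary: bytes, tenant_id: str,
          parser_config: dict, callback: Callable[[float, str], None]) -> List[dict]:
    parser_config["layout_recognize"] = False
    # The point of the fix: parser_config and callback are passed through to
    # the sectioning step instead of being dropped.
    sections = naive_chunk(filename, binary, from_page=0, to_page=100000,
                           section_only=True, parser_config=parser_config,
                           callback=callback)
    return build_kg_chunks(
        tenant_id, sections, callback,
        parser_config.get("entity_types",
                          ["organization", "person", "location", "event", "time"]))


if __name__ == "__main__":
    chunks = chunk("demo.txt", b"some text", "tenant-1",
                   {"entity_types": ["person", "event"]},
                   lambda p, msg: print(f"{p:.0%} {msg}"))
    print(len(chunks), "chunks")
```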