Kevin Hu
committed on
Commit
·
6a44b6e
1
Parent(s):
d696cd8
fix uploading docx for mind map (#2064)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
api/db/services/document_service.py
CHANGED
@@ -17,6 +17,8 @@ import hashlib
|
|
17 |
import json
|
18 |
import os
|
19 |
import random
|
|
|
|
|
20 |
from concurrent.futures import ThreadPoolExecutor
|
21 |
from copy import deepcopy
|
22 |
from datetime import datetime
|
@@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor
|
|
33 |
from rag.settings import SVR_QUEUE_NAME
|
34 |
from rag.utils.es_conn import ELASTICSEARCH
|
35 |
from rag.utils.minio_conn import MINIO
|
36 |
-
from rag.nlp import search
|
37 |
|
38 |
from api.db import FileType, TaskStatus, ParserType, LLMType
|
39 |
from api.db.db_models import DB, Knowledgebase, Tenant, Task
|
@@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
|
432 |
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
|
433 |
exe = ThreadPoolExecutor(max_workers=12)
|
434 |
threads = []
|
|
|
|
|
|
|
435 |
for d, blob in files:
|
436 |
kwargs = {
|
437 |
"callback": dummy,
|
@@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
|
504 |
"id": get_uuid(),
|
505 |
"doc_id": doc_id,
|
506 |
"kb_id": [kb.id],
|
|
|
|
|
|
|
507 |
"content_with_weight": mind_map,
|
508 |
"knowledge_graph_kwd": "mind_map"
|
509 |
})
|
|
|
17 |
import json
|
18 |
import os
|
19 |
import random
|
20 |
+
import re
|
21 |
+
import traceback
|
22 |
from concurrent.futures import ThreadPoolExecutor
|
23 |
from copy import deepcopy
|
24 |
from datetime import datetime
|
|
|
35 |
from rag.settings import SVR_QUEUE_NAME
|
36 |
from rag.utils.es_conn import ELASTICSEARCH
|
37 |
from rag.utils.minio_conn import MINIO
|
38 |
+
from rag.nlp import search, rag_tokenizer
|
39 |
|
40 |
from api.db import FileType, TaskStatus, ParserType, LLMType
|
41 |
from api.db.db_models import DB, Knowledgebase, Tenant, Task
|
|
|
434 |
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
|
435 |
exe = ThreadPoolExecutor(max_workers=12)
|
436 |
threads = []
|
437 |
+
doc_nm = {}
|
438 |
+
for d, blob in files:
|
439 |
+
doc_nm[d["id"]] = d["name"]
|
440 |
for d, blob in files:
|
441 |
kwargs = {
|
442 |
"callback": dummy,
|
|
|
509 |
"id": get_uuid(),
|
510 |
"doc_id": doc_id,
|
511 |
"kb_id": [kb.id],
|
512 |
+
"docnm_kwd": doc_nm[doc_id],
|
513 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])),
|
514 |
+
"content_ltks": "",
|
515 |
"content_with_weight": mind_map,
|
516 |
"knowledge_graph_kwd": "mind_map"
|
517 |
})
|