Kevin Hu committed
Commit 6a44b6e · 1 Parent(s): d696cd8

fix uploading docx for mind map (#2064)


### What problem does this PR solve?

The mind-map chunk built in `doc_upload_and_parse` was indexed without the document-name fields (`docnm_kwd`, `title_tks`, `content_ltks`), so uploading a docx for mind-map generation failed.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/db/services/document_service.py CHANGED
@@ -17,6 +17,8 @@ import hashlib
 import json
 import os
 import random
+import re
+import traceback
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from datetime import datetime
@@ -33,7 +35,7 @@ from graphrag.mind_map_extractor import MindMapExtractor
 from rag.settings import SVR_QUEUE_NAME
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
-from rag.nlp import search
+from rag.nlp import search, rag_tokenizer
 
 from api.db import FileType, TaskStatus, ParserType, LLMType
 from api.db.db_models import DB, Knowledgebase, Tenant, Task
@@ -432,6 +434,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
     parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
     exe = ThreadPoolExecutor(max_workers=12)
     threads = []
+    doc_nm = {}
+    for d, blob in files:
+        doc_nm[d["id"]] = d["name"]
     for d, blob in files:
         kwargs = {
             "callback": dummy,
@@ -504,6 +509,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
                     "id": get_uuid(),
                     "doc_id": doc_id,
                     "kb_id": [kb.id],
+                    "docnm_kwd": doc_nm[doc_id],
+                    "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc_nm[doc_id])),
+                    "content_ltks": "",
                     "content_with_weight": mind_map,
                     "knowledge_graph_kwd": "mind_map"
                 })
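
For readers unfamiliar with the chunk schema, the sketch below (not part of the PR) illustrates what the added fields contribute: `docnm_kwd` keeps the original file name as a keyword, `title_tks` holds the tokenized name with its extension stripped, and `content_ltks` stays empty because the mind map itself is stored in `content_with_weight`. The `tokenize` helper and `mind_map_chunk` function are hypothetical stand-ins used only for illustration; the real code calls `rag.nlp.rag_tokenizer.tokenize` and builds the dict inline.

```python
import re


def tokenize(text: str) -> str:
    """Hypothetical stand-in for rag.nlp.rag_tokenizer.tokenize (which is CJK-aware)."""
    return " ".join(text.lower().split())


def mind_map_chunk(doc_id: str, doc_nm: dict, kb_id: str, mind_map: str) -> dict:
    """Sketch of the mind-map chunk fields after this fix ("id" via get_uuid() omitted)."""
    name = doc_nm[doc_id]                        # e.g. "annual report.docx"
    title = re.sub(r"\.[a-zA-Z]+$", "", name)    # strip the file extension -> "annual report"
    return {
        "doc_id": doc_id,
        "kb_id": [kb_id],
        "docnm_kwd": name,             # original file name, kept as a keyword field
        "title_tks": tokenize(title),  # tokenized title used for name-based retrieval
        "content_ltks": "",            # no plain-text content for a mind-map chunk
        "content_with_weight": mind_map,
        "knowledge_graph_kwd": "mind_map",
    }


if __name__ == "__main__":
    chunk = mind_map_chunk("d1", {"d1": "annual report.docx"}, "kb1", '{"root": {}}')
    print(chunk["title_tks"])  # -> "annual report"
```

Stripping the extension before tokenizing presumably lets name-based queries match the mind-map chunk the same way they match the document's regular chunks.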