Commit b71b66c
Kevin Hu committed
Parent(s): 6b2a674

add function: upload and parse (#1889)

### What problem does this PR solve?

#1880

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- api/apps/conversation_app.py +2 -0
- api/apps/document_app.py +148 -56
- api/db/services/dialog_service.py +7 -3
- api/db/services/document_service.py +1 -1
- api/db/services/file_service.py +63 -2
- graphrag/index.py +2 -26
- graphrag/mind_map_extractor.py +31 -1
- graphrag/mind_map_prompt.py +1 -0
api/apps/conversation_app.py
CHANGED

```diff
@@ -118,6 +118,8 @@ def completion():
         if m["role"] == "assistant" and not msg:
             continue
         msg.append({"role": m["role"], "content": m["content"]})
+        if "doc_ids" in m:
+            msg[-1]["doc_ids"] = m["doc_ids"]
     try:
         e, conv = ConversationService.get_by_id(req["conversation_id"])
         if not e:
```
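With this change, any message in the `completion()` payload may carry a `doc_ids` field, which is now copied onto the message list handed to the chat pipeline. A minimal sketch of such a payload (ids are illustrative, not from the PR):

```python
# Illustrative completion() request body: the user message pins specific
# uploaded documents so retrieval is restricted to them downstream.
payload = {
    "conversation_id": "<conversation id>",
    "messages": [
        {
            "role": "user",
            "content": "Summarize the attached report.",
            "doc_ids": ["<doc id 1>", "<doc id 2>"],  # optional attachment ids
        }
    ],
}
```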
api/apps/document_app.py
CHANGED

```diff
@@ -13,10 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 #
-
+import datetime
+import hashlib
+import json
 import os
 import pathlib
 import re
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
+from io import BytesIO
 
 import flask
 from elasticsearch_dsl import Q
@@ -24,22 +30,26 @@ from flask import request
 from flask_login import login_required, current_user
 
 from api.db.db_models import Task, File
+from api.db.services.dialog_service import DialogService, ConversationService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, queue_tasks
+from api.db.services.user_service import TenantService
+from graphrag.mind_map_extractor import MindMapExtractor
+from rag.app import naive
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
-from api.db import FileType, TaskStatus, ParserType, FileSource
+from api.db import FileType, TaskStatus, ParserType, FileSource, LLMType
 from api.db.services.document_service import DocumentService
-from api.settings import RetCode
+from api.settings import RetCode, stat_logger
 from api.utils.api_utils import get_json_result
 from rag.utils.minio_conn import MINIO
-from api.utils.file_utils import filename_type, thumbnail
-from api.utils.web_utils import html2pdf, is_valid_url
+from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
 from api.utils.web_utils import html2pdf, is_valid_url
 
 
@@ -65,55 +75,7 @@ def upload():
         if not e:
             raise LookupError("Can't find this knowledgebase!")
 
-
-        pf_id = root_folder["id"]
-        FileService.init_knowledgebase_docs(pf_id, current_user.id)
-        kb_root_folder = FileService.get_kb_folder(current_user.id)
-        kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
-
-        err = []
-        for file in file_objs:
-            try:
-                MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
-                if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
-                    raise RuntimeError("Exceed the maximum file number of a free user!")
-
-                filename = duplicate_name(
-                    DocumentService.query,
-                    name=file.filename,
-                    kb_id=kb.id)
-                filetype = filename_type(filename)
-                if filetype == FileType.OTHER.value:
-                    raise RuntimeError("This type of file has not been supported yet!")
-
-                location = filename
-                while MINIO.obj_exist(kb_id, location):
-                    location += "_"
-                blob = file.read()
-                MINIO.put(kb_id, location, blob)
-                doc = {
-                    "id": get_uuid(),
-                    "kb_id": kb.id,
-                    "parser_id": kb.parser_id,
-                    "parser_config": kb.parser_config,
-                    "created_by": current_user.id,
-                    "type": filetype,
-                    "name": filename,
-                    "location": location,
-                    "size": len(blob),
-                    "thumbnail": thumbnail(filename, blob)
-                }
-                if doc["type"] == FileType.VISUAL:
-                    doc["parser_id"] = ParserType.PICTURE.value
-                if doc["type"] == FileType.AURAL:
-                    doc["parser_id"] = ParserType.AUDIO.value
-                if re.search(r"\.(ppt|pptx|pages)$", filename):
-                    doc["parser_id"] = ParserType.PRESENTATION.value
-                DocumentService.insert(doc)
-
-                FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
-            except Exception as e:
-                err.append(file.filename + ": " + str(e))
+        err, _ = FileService.upload_document(kb, file_objs)
         if err:
             return get_json_result(
                 data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
@@ -149,7 +111,7 @@ def web_crawl():
     try:
         filename = duplicate_name(
             DocumentService.query,
-            name=name+".pdf",
+            name=name + ".pdf",
             kb_id=kb.id)
         filetype = filename_type(filename)
         if filetype == FileType.OTHER.value:
@@ -414,7 +376,7 @@ def get(doc_id):
         if not e:
             return get_data_error_result(retmsg="Document not found!")
 
-        b,n = File2DocumentService.get_minio_address(doc_id=doc_id)
+        b, n = File2DocumentService.get_minio_address(doc_id=doc_id)
         response = flask.make_response(MINIO.get(b, n))
 
         ext = re.search(r"\.([^.]+)$", doc.name)
@@ -484,3 +446,133 @@ def get_image(image_id):
         return response
     except Exception as e:
         return server_error_response(e)
+
+
+@manager.route('/upload_and_parse', methods=['POST'])
+@login_required
+@validate_request("conversation_id")
+def upload_and_parse():
+    req = request.json
+    if 'file' not in request.files:
+        return get_json_result(
+            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
+
+    file_objs = request.files.getlist('file')
+    for file_obj in file_objs:
+        if file_obj.filename == '':
+            return get_json_result(
+                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
+
+    e, conv = ConversationService.get_by_id(req["conversation_id"])
+    if not e:
+        return get_data_error_result(retmsg="Conversation not found!")
+    e, dia = DialogService.get_by_id(conv.dialog_id)
+    kb_id = dia.kb_ids[0]
+    e, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not e:
+        raise LookupError("Can't find this knowledgebase!")
+
+    idxnm = search.index_name(kb.tenant_id)
+    if not ELASTICSEARCH.indexExist(idxnm):
+        ELASTICSEARCH.createIdx(idxnm, json.load(
+            open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
+
+    embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id, lang=kb.language)
+
+    err, files = FileService.upload_document(kb, file_objs)
+    if err:
+        return get_json_result(
+            data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?。;!?", "layout_recognize": False}
+    exe = ThreadPoolExecutor(max_workers=12)
+    threads = []
+    for d, blob in files:
+        kwargs = {
+            "callback": dummy,
+            "parser_config": parser_config,
+            "from_page": 0,
+            "to_page": 100000
+        }
+        threads.append(exe.submit(naive.chunk, d["name"], blob, **kwargs))
+
+    for (docinfo,_), th in zip(files, threads):
+        docs = []
+        doc = {
+            "doc_id": docinfo["id"],
+            "kb_id": [kb.id]
+        }
+        for ck in th.result():
+            d = deepcopy(doc)
+            d.update(ck)
+            md5 = hashlib.md5()
+            md5.update((ck["content_with_weight"] +
+                        str(d["doc_id"])).encode("utf-8"))
+            d["_id"] = md5.hexdigest()
+            d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
+            d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+            if not d.get("image"):
+                docs.append(d)
+                continue
+
+            output_buffer = BytesIO()
+            if isinstance(d["image"], bytes):
+                output_buffer = BytesIO(d["image"])
+            else:
+                d["image"].save(output_buffer, format='JPEG')
+
+            MINIO.put(kb.id, d["_id"], output_buffer.getvalue())
+            d["img_id"] = "{}-{}".format(kb.id, d["_id"])
+            del d["image"]
+            docs.append(d)
+
+    parser_ids = {d["id"]: d["parser_id"] for d, _ in files}
+    docids = [d["id"] for d, _ in files]
+    chunk_counts = {id: 0 for id in docids}
+    token_counts = {id: 0 for id in docids}
+    es_bulk_size = 64
+
+    def embedding(doc_id, cnts, batch_size=16):
+        nonlocal embd_mdl, chunk_counts, token_counts
+        vects = []
+        for i in range(0, len(cnts), batch_size):
+            vts, c = embd_mdl.encode(cnts[i: i + batch_size])
+            vects.extend(vts.tolist())
+            chunk_counts[doc_id] += len(cnts[i:i + batch_size])
+            token_counts[doc_id] += c
+        return vects
+
+    _, tenant = TenantService.get_by_id(kb.tenant_id)
+    llm_bdl = LLMBundle(kb.tenant_id, LLMType.CHAT, tenant.llm_id)
+    for doc_id in docids:
+        cks = [c for c in docs if c["doc_id"] == doc_id]
+
+        if parser_ids[doc_id] != ParserType.PICTURE.value:
+            mindmap = MindMapExtractor(llm_bdl)
+            try:
+                mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
+                if len(mind_map) < 32: raise Exception("Few content: "+mind_map)
+                cks.append({
+                    "doc_id": doc_id,
+                    "kb_id": [kb.id],
+                    "content_with_weight": mind_map,
+                    "knowledge_graph_kwd": "mind_map"
+                })
+            except Exception as e:
+                stat_logger.error("Mind map generation error:", traceback.format_exc())
+
+        vects = embedding(doc_id, cks)
+        assert len(cks) == len(vects)
+        for i, d in enumerate(cks):
+            v = vects[i]
+            d["q_%d_vec" % len(v)] = v
+        for b in range(0, len(cks), es_bulk_size):
+            ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], idxnm)
+
+        DocumentService.increment_chunk_num(
+            doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
+
+    return get_json_result(data=[d["id"] for d in files])
```
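A hedged usage sketch of the new endpoint; the base URL, route prefix, and login flow are assumptions, not part of this PR (and note the handler reads `conversation_id` from `request.json` while files arrive as multipart, so a real client may need to match whatever the server actually accepts):

```python
import requests

BASE = "http://localhost:9380"  # assumed host/port
session = requests.Session()
# ... authenticate first so the @login_required check passes ...

resp = session.post(
    f"{BASE}/v1/document/upload_and_parse",  # route prefix is an assumption
    data={"conversation_id": "<conversation id>"},
    files=[("file", ("report.pdf", open("report.pdf", "rb")))],
)
print(resp.json())  # on success, "data" holds the uploaded document ids
```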
api/db/services/dialog_service.py
CHANGED

```diff
@@ -104,7 +104,11 @@ def chat(dialog, messages, stream=True, **kwargs):
     is_kg = all([kb.parser_id == ParserType.KG for kb in kbs])
     retr = retrievaler if not is_kg else kg_retrievaler
 
-    questions = [m["content"] for m in messages if m["role"] == "user"]
+    questions = [m["content"] for m in messages if m["role"] == "user"][-3:]
+    attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None
+    if "doc_ids" in messages[-1]:
+        attachments = messages[-1]["doc_ids"]
+
     embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])
     if llm_id2llm_type(dialog.llm_id) == "image2text":
         chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id)
@@ -144,7 +148,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                  dialog.similarity_threshold,
                                  dialog.vector_similarity_weight,
-                                 doc_ids=
+                                 doc_ids=attachments,
                                  top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
         knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
         #self-rag
@@ -153,7 +157,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                  dialog.similarity_threshold,
                                  dialog.vector_similarity_weight,
-                                 doc_ids=
+                                 doc_ids=attachments,
                                  top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
         knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
 
```
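The resolution order implemented above: a comma-separated `doc_ids` keyword argument is the fallback, and a `doc_ids` list on the latest message overrides it. The same logic in isolation, as a runnable sketch:

```python
def resolve_attachments(messages, **kwargs):
    # Fallback: doc_ids passed as a comma-separated string in kwargs.
    attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None
    # Override: an explicit doc_ids list on the most recent message wins.
    if "doc_ids" in messages[-1]:
        attachments = messages[-1]["doc_ids"]
    return attachments

msgs = [{"role": "user", "content": "hi", "doc_ids": ["d1"]}]
print(resolve_attachments(msgs, doc_ids="a,b"))  # -> ['d1']
```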
api/db/services/document_service.py
CHANGED

```diff
@@ -26,7 +26,7 @@ from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
 
-from api.db import FileType, TaskStatus
+from api.db import FileType, TaskStatus, ParserType
 from api.db.db_models import DB, Knowledgebase, Tenant, Task
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
```
api/db/services/file_service.py
CHANGED

```diff
@@ -13,16 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import re
+
 from flask_login import current_user
 from peewee import fn
 
-from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource
+from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource, ParserType
 from api.db.db_models import DB, File2Document, Knowledgebase
 from api.db.db_models import File, Document
+from api.db.services import duplicate_name
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.utils import get_uuid
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
 
 
 class FileService(CommonService):
@@ -318,4 +323,60 @@ class FileService(CommonService):
             cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
         except Exception as e:
             print(e)
-            raise RuntimeError("Database error (File move)!")
+            raise RuntimeError("Database error (File move)!")
+
+    @classmethod
+    @DB.connection_context()
+    def upload_document(self, kb, file_objs):
+        root_folder = self.get_root_folder(current_user.id)
+        pf_id = root_folder["id"]
+        self.init_knowledgebase_docs(pf_id, current_user.id)
+        kb_root_folder = self.get_kb_folder(current_user.id)
+        kb_folder = self.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
+
+        err, files = [], []
+        for file in file_objs:
+            try:
+                MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+                if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                    raise RuntimeError("Exceed the maximum file number of a free user!")
+
+                filename = duplicate_name(
+                    DocumentService.query,
+                    name=file.filename,
+                    kb_id=kb.id)
+                filetype = filename_type(filename)
+                if filetype == FileType.OTHER.value:
+                    raise RuntimeError("This type of file has not been supported yet!")
+
+                location = filename
+                while MINIO.obj_exist(kb.id, location):
+                    location += "_"
+                blob = file.read()
+                MINIO.put(kb.id, location, blob)
+                doc = {
+                    "id": get_uuid(),
+                    "kb_id": kb.id,
+                    "parser_id": kb.parser_id,
+                    "parser_config": kb.parser_config,
+                    "created_by": current_user.id,
+                    "type": filetype,
+                    "name": filename,
+                    "location": location,
+                    "size": len(blob),
+                    "thumbnail": thumbnail(filename, blob)
+                }
+                if doc["type"] == FileType.VISUAL:
+                    doc["parser_id"] = ParserType.PICTURE.value
+                if doc["type"] == FileType.AURAL:
+                    doc["parser_id"] = ParserType.AUDIO.value
+                if re.search(r"\.(ppt|pptx|pages)$", filename):
+                    doc["parser_id"] = ParserType.PRESENTATION.value
+                DocumentService.insert(doc)
+
+                FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
+                files.append((doc, blob))
+            except Exception as e:
+                err.append(file.filename + ": " + str(e))
+
+        return err, files
```
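`upload_document` returns an `(err, files)` pair: `err` collects one message per failed file, and `files` holds `(doc, blob)` tuples for the successes, which is what lets `upload_and_parse` re-use the raw bytes for chunking. A sketch of a caller, assuming `kb` and `file_objs` come from the surrounding request context:

```python
err, files = FileService.upload_document(kb, file_objs)
if err:
    raise RuntimeError("\n".join(err))  # surface per-file failures
for doc, blob in files:
    # doc is the stored metadata row, blob the raw uploaded bytes
    print(doc["id"], doc["name"], len(blob))
```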
graphrag/index.py
CHANGED

```diff
@@ -30,24 +30,6 @@ from rag.nlp import rag_tokenizer
 from rag.utils import num_tokens_from_string
 
 
-def be_children(obj: dict, keyset:set):
-    if isinstance(obj, str):
-        obj = [obj]
-    if isinstance(obj, list):
-        for i in obj: keyset.add(i)
-        return [{"id": re.sub(r"\*+", "", i), "children":[]} for i in obj]
-    arr = []
-    for k,v in obj.items():
-        k = re.sub(r"\*+", "", k)
-        if not k or k in keyset:continue
-        keyset.add(k)
-        arr.append({
-            "id": k,
-            "children": be_children(v, keyset)
-        })
-    return arr
-
-
 def graph_merge(g1, g2):
     g = g2.copy()
     for n, attr in g1.nodes(data=True):
@@ -153,16 +135,10 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
     mg = mindmap(_chunks).output
     if not len(mg.keys()): return chunks
 
-    if len(mg.keys()) > 1:
-        keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
-        md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
-    else:
-        k = re.sub(r"\*+", "", list(mg.keys())[0])
-        md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))}
-    print(json.dumps(md_map, ensure_ascii=False, indent=2))
+    print(json.dumps(mg, ensure_ascii=False, indent=2))
     chunks.append(
         {
-            "content_with_weight": json.dumps(md_map, ensure_ascii=False, indent=2),
+            "content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
             "knowledge_graph_kwd": "mind_map"
         })
 
```
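With the tree-shaping moved into `MindMapExtractor`, `build_knowlege_graph_chunks` now serializes the extractor's output as-is. Roughly, the appended chunk looks like this (structure inferred from the diff, values illustrative):

```python
import json

# The extractor now returns an already-shaped {"id", "children"} tree.
mg = {"id": "root", "children": [{"id": "Chapter 1", "children": []}]}
chunk = {
    "content_with_weight": json.dumps(mg, ensure_ascii=False, indent=2),
    "knowledge_graph_kwd": "mind_map",
}
```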
graphrag/mind_map_extractor.py
CHANGED

```diff
@@ -57,6 +57,26 @@ class MindMapExtractor:
         self._mind_map_prompt = prompt or MIND_MAP_EXTRACTION_PROMPT
         self._on_error = on_error or (lambda _e, _s, _d: None)
 
+    def _key(self, k):
+        return re.sub(r"\*+", "", k)
+
+    def _be_children(self, obj: dict, keyset: set):
+        if isinstance(obj, str):
+            obj = [obj]
+        if isinstance(obj, list):
+            for i in obj: keyset.add(i)
+            return [{"id": re.sub(r"\*+", "", i), "children": []} for i in obj]
+        arr = []
+        for k, v in obj.items():
+            k = self._key(k)
+            if not k or k in keyset: continue
+            keyset.add(k)
+            arr.append({
+                "id": k,
+                "children": self._be_children(v, keyset)
+            })
+        return arr
+
     def __call__(
         self, sections: list[str], prompt_variables: dict[str, Any] | None = None
     ) -> MindMapResult:
@@ -86,13 +106,23 @@ class MindMapExtractor:
                 res.append(_.result())
 
             merge_json = reduce(self._merge, res)
-            merge_json
+            if len(merge_json.keys()) > 1:
+                keyset = set(
+                    [re.sub(r"\*+", "", k) for k, v in merge_json.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
+                merge_json = {"id": "root",
+                              "children": [{"id": self._key(k), "children": self._be_children(v, keyset)} for k, v in
+                                           merge_json.items() if isinstance(v, dict) and self._key(k)]}
+            else:
+                k = self._key(list(self._be_children.keys())[0])
+                merge_json = {"id": k, "children": self._be_children(list(merge_json.items())[0][1], set([k]))}
+
         except Exception as e:
             logging.exception("error mind graph")
             self._on_error(
                 e,
                 traceback.format_exc(), None
            )
+            merge_json = {"error": str(e)}
 
         return MindMapResult(output=merge_json)
 
```
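A standalone sketch of the `_be_children` transformation (rewritten as a free function for illustration): strings and lists become leaf nodes, dicts recurse, and `*` markers are stripped from keys, which are deduplicated through the shared `keyset`. Note that the single-key branch in the diff calls `list(self._be_children.keys())`, which would raise at runtime; from the surrounding code it presumably means `merge_json.keys()`.

```python
import re

def be_children(obj, keyset):
    if isinstance(obj, str):
        obj = [obj]  # promote a bare string to a single-item list
    if isinstance(obj, list):
        for i in obj:
            keyset.add(i)
        return [{"id": re.sub(r"\*+", "", i), "children": []} for i in obj]
    arr = []
    for k, v in obj.items():
        k = re.sub(r"\*+", "", k)        # strip markdown bold markers
        if not k or k in keyset:
            continue                     # skip empty or duplicate keys
        keyset.add(k)
        arr.append({"id": k, "children": be_children(v, keyset)})
    return arr

print(be_children({"**A**": {"B": ["C", "D"]}}, set()))
# [{'id': 'A', 'children': [{'id': 'B', 'children':
#   [{'id': 'C', 'children': []}, {'id': 'D', 'children': []}]}]}]
```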
graphrag/mind_map_prompt.py
CHANGED

```diff
@@ -23,6 +23,7 @@ MIND_MAP_EXTRACTION_PROMPT = """
 4. Add a shot content summary of the bottom level section.
 
 - Output requirement:
+  - Generate at least 4 levels.
   - Always try to maximize the number of sub-sections.
   - In language of 'Text'
   - MUST IN FORMAT OF MARKDOWN
```