Kevin Hu committed
Commit 36b9967 · 1 parent: 05c4835
refine upload & parse (#1969)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
- api/apps/api_app.py +24 -2
- api/apps/document_app.py +5 -124
- api/db/services/document_service.py +143 -1
- api/db/services/file_service.py +5 -5
api/apps/api_app.py CHANGED

```diff
@@ -26,7 +26,7 @@ from api.db.db_models import APIToken, API4Conversation, Task, File
 from api.db.services import duplicate_name
 from api.db.services.api_service import APITokenService, API4ConversationService
 from api.db.services.dialog_service import DialogService, chat
-from api.db.services.document_service import DocumentService
+from api.db.services.document_service import DocumentService, doc_upload_and_parse
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -470,6 +470,29 @@ def upload():
     return get_json_result(data=doc_result.to_json())
 
 
+@manager.route('/document/upload_and_parse', methods=['POST'])
+@validate_request("conversation_id")
+def upload_parse():
+    token = request.headers.get('Authorization').split()[1]
+    objs = APIToken.query(token=token)
+    if not objs:
+        return get_json_result(
+            data=False, retmsg='Token is not valid!', retcode=RetCode.AUTHENTICATION_ERROR)
+
+    if 'file' not in request.files:
+        return get_json_result(
+            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
+
+    file_objs = request.files.getlist('file')
+    for file_obj in file_objs:
+        if file_obj.filename == '':
+            return get_json_result(
+                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
+
+    doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, objs[0].tenant_id)
+    return get_json_result(data=doc_ids)
+
+
 @manager.route('/list_chunks', methods=['POST'])
 # @login_required
 def list_chunks():
@@ -560,7 +583,6 @@ def document_rm():
 
     tenant_id = objs[0].tenant_id
     req = request.json
-    doc_ids = []
     try:
         doc_ids = [DocumentService.get_doc_id_by_doc_name(doc_name) for doc_name in req.get("doc_names", [])]
         for doc_id in req.get("doc_ids", []):
```
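For reference, a client call to the new token-authenticated endpoint might look like the sketch below. The `/document/upload_and_parse` route, the Bearer token (read via `split()[1]`), the `conversation_id` form field, and the `file` multipart field all come from the diff above; the host, port, `/v1/api` mount prefix, and the `retcode`/`retmsg`/`data` response envelope are assumptions about a typical deployment.

```python
# Minimal client sketch; BASE_URL and the token value are hypothetical.
import requests

BASE_URL = "http://localhost:9380/v1/api"  # assumed blueprint mount point
API_TOKEN = "ragflow-xxxxxx"               # an APIToken issued for the tenant

resp = requests.post(
    f"{BASE_URL}/document/upload_and_parse",
    headers={"Authorization": f"Bearer {API_TOKEN}"},        # parsed by split()[1]
    data={"conversation_id": "<conversation id>"},           # checked by @validate_request
    files=[("file", ("manual.pdf", open("manual.pdf", "rb")))],  # request.files.getlist('file')
)
print(resp.json())  # on success, "data" carries the ids of the uploaded documents
```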
api/apps/document_app.py CHANGED

```diff
@@ -45,7 +45,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
 from api.db import FileType, TaskStatus, ParserType, FileSource, LLMType
-from api.db.services.document_service import DocumentService
+from api.db.services.document_service import DocumentService, doc_upload_and_parse
 from api.settings import RetCode, stat_logger
 from api.utils.api_utils import get_json_result
 from rag.utils.minio_conn import MINIO
@@ -75,7 +75,7 @@ def upload():
     if not e:
         raise LookupError("Can't find this knowledgebase!")
 
-    err, _ = FileService.upload_document(kb, file_objs)
+    err, _ = FileService.upload_document(kb, file_objs, current_user.id)
     if err:
         return get_json_result(
             data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
@@ -212,7 +212,7 @@ def docinfos():
 
 
 @manager.route('/thumbnails', methods=['GET'])
-@login_required
+#@login_required
 def thumbnails():
     doc_ids = request.args.get("doc_ids").split(",")
     if not doc_ids:
@@ -460,7 +460,6 @@ def get_image(image_id):
 @login_required
 @validate_request("conversation_id")
 def upload_and_parse():
-    from rag.app import presentation, picture, naive, audio, email
     if 'file' not in request.files:
         return get_json_result(
             data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
@@ -471,124 +470,6 @@ def upload_and_parse():
         return get_json_result(
             data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
 
-    e, conv = ConversationService.get_by_id(request.form.get("conversation_id"))
-    if not e:
-        return get_data_error_result(retmsg="Conversation not found!")
-    e, dia = DialogService.get_by_id(conv.dialog_id)
-    kb_id = dia.kb_ids[0]
-    e, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not e:
-        raise LookupError("Can't find this knowledgebase!")
-
-    idxnm = search.index_name(kb.tenant_id)
-    if not ELASTICSEARCH.indexExist(idxnm):
-        ELASTICSEARCH.createIdx(idxnm, json.load(
-            open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
-
-    embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id, lang=kb.language)
-
-    err, files = FileService.upload_document(kb, file_objs)
-    if err:
-        return get_json_result(
-            data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
-
-    def dummy(prog=None, msg=""):
-        pass
-
-    FACTORY = {
-        ParserType.PRESENTATION.value: presentation,
-        ParserType.PICTURE.value: picture,
-        ParserType.AUDIO.value: audio,
-        ParserType.EMAIL.value: email
-    }
-    parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
-    exe = ThreadPoolExecutor(max_workers=12)
-    threads = []
-    for d, blob in files:
-        kwargs = {
-            "callback": dummy,
-            "parser_config": parser_config,
-            "from_page": 0,
-            "to_page": 100000,
-            "tenant_id": kb.tenant_id,
-            "lang": kb.language
-        }
-        threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))
+    doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
 
-    for (docinfo, _), th in zip(files, threads):
-        docs = []
-        doc = {
-            "doc_id": docinfo["id"],
-            "kb_id": [kb.id]
-        }
-        for ck in th.result():
-            d = deepcopy(doc)
-            d.update(ck)
-            md5 = hashlib.md5()
-            md5.update((ck["content_with_weight"] +
-                        str(d["doc_id"])).encode("utf-8"))
-            d["_id"] = md5.hexdigest()
-            d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
-            d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
-            if not d.get("image"):
-                docs.append(d)
-                continue
-
-            output_buffer = BytesIO()
-            if isinstance(d["image"], bytes):
-                output_buffer = BytesIO(d["image"])
-            else:
-                d["image"].save(output_buffer, format='JPEG')
-
-            MINIO.put(kb.id, d["_id"], output_buffer.getvalue())
-            d["img_id"] = "{}-{}".format(kb.id, d["_id"])
-            del d["image"]
-            docs.append(d)
-
-    parser_ids = {d["id"]: d["parser_id"] for d, _ in files}
-    docids = [d["id"] for d, _ in files]
-    chunk_counts = {id: 0 for id in docids}
-    token_counts = {id: 0 for id in docids}
-    es_bulk_size = 64
-
-    def embedding(doc_id, cnts, batch_size=16):
-        nonlocal embd_mdl, chunk_counts, token_counts
-        vects = []
-        for i in range(0, len(cnts), batch_size):
-            vts, c = embd_mdl.encode(cnts[i: i + batch_size])
-            vects.extend(vts.tolist())
-            chunk_counts[doc_id] += len(cnts[i:i + batch_size])
-            token_counts[doc_id] += c
-        return vects
-
-    _, tenant = TenantService.get_by_id(kb.tenant_id)
-    llm_bdl = LLMBundle(kb.tenant_id, LLMType.CHAT, tenant.llm_id)
-    for doc_id in docids:
-        cks = [c for c in docs if c["doc_id"] == doc_id]
-
-        if False and parser_ids[doc_id] != ParserType.PICTURE.value:
-            mindmap = MindMapExtractor(llm_bdl)
-            try:
-                mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output, ensure_ascii=False, indent=2)
-                if len(mind_map) < 32: raise Exception("Few content: "+mind_map)
-                cks.append({
-                    "doc_id": doc_id,
-                    "kb_id": [kb.id],
-                    "content_with_weight": mind_map,
-                    "knowledge_graph_kwd": "mind_map"
-                })
-            except Exception as e:
-                stat_logger.error("Mind map generation error:", traceback.format_exc())
-
-        vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
-        assert len(cks) == len(vects)
-        for i, d in enumerate(cks):
-            v = vects[i]
-            d["q_%d_vec" % len(v)] = v
-        for b in range(0, len(cks), es_bulk_size):
-            ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], idxnm)
-
-        DocumentService.increment_chunk_num(
-            doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
-
-    return get_json_result(data=[d["id"] for d,_ in files])
+    return get_json_result(data=doc_ids)
```
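After these hunks, the view is reduced to input validation plus one call into the service layer. Assembled from the context lines above (the file-list loop between the hunks is unchanged and mirrors the api_app.py version; the route path is not shown in the diff, so it is assumed here), the post-refactor view reads roughly as:

```python
# Sketch of the refactored view; the route path is an assumption and
# the body is assembled from the kept lines of the diff above.
@manager.route('/upload_and_parse', methods=['POST'])  # path assumed
@login_required
@validate_request("conversation_id")
def upload_and_parse():
    if 'file' not in request.files:
        return get_json_result(
            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)

    file_objs = request.files.getlist('file')
    for file_obj in file_objs:
        if file_obj.filename == '':
            return get_json_result(
                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)

    # Parsing, embedding, and indexing now live in doc_upload_and_parse.
    doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)

    return get_json_result(data=doc_ids)
```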
api/db/services/document_service.py CHANGED

```diff
@@ -13,20 +13,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import hashlib
+import json
+import os
 import random
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
 from datetime import datetime
+from io import BytesIO
+
 from elasticsearch_dsl import Q
 from peewee import fn
 
 from api.db.db_utils import bulk_insert_into_db
 from api.settings import stat_logger
 from api.utils import current_timestamp, get_format_time, get_uuid
+from api.utils.file_utils import get_project_base_directory
+from graphrag.mind_map_extractor import MindMapExtractor
 from rag.settings import SVR_QUEUE_NAME
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
 
-from api.db import FileType, TaskStatus, ParserType
+from api.db import FileType, TaskStatus, ParserType, LLMType
 from api.db.db_models import DB, Knowledgebase, Tenant, Task
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
@@ -380,3 +389,136 @@ def queue_raptor_tasks(doc):
     bulk_insert_into_db(Task, [task], True)
     task["type"] = "raptor"
     assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=task), "Can't access Redis. Please check the Redis' status."
+
+
+def doc_upload_and_parse(conversation_id, file_objs, user_id):
+    from rag.app import presentation, picture, naive, audio, email
+    from api.db.services.dialog_service import ConversationService, DialogService
+    from api.db.services.file_service import FileService
+    from api.db.services.llm_service import LLMBundle
+    from api.db.services.user_service import TenantService
+    from api.db.services.api_service import API4ConversationService
+
+    e, conv = ConversationService.get_by_id(conversation_id)
+    if not e:
+        e, conv = API4ConversationService.get_by_id(conversation_id)
+    assert e, "Conversation not found!"
+
+    e, dia = DialogService.get_by_id(conv.dialog_id)
+    kb_id = dia.kb_ids[0]
+    e, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not e:
+        raise LookupError("Can't find this knowledgebase!")
+
+    idxnm = search.index_name(kb.tenant_id)
+    if not ELASTICSEARCH.indexExist(idxnm):
+        ELASTICSEARCH.createIdx(idxnm, json.load(
+            open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
+
+    embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id, lang=kb.language)
+
+    err, files = FileService.upload_document(kb, file_objs, user_id)
+    assert not err, "\n".join(err)
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    FACTORY = {
+        ParserType.PRESENTATION.value: presentation,
+        ParserType.PICTURE.value: picture,
+        ParserType.AUDIO.value: audio,
+        ParserType.EMAIL.value: email
+    }
+    parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
+    exe = ThreadPoolExecutor(max_workers=12)
+    threads = []
+    for d, blob in files:
+        kwargs = {
+            "callback": dummy,
+            "parser_config": parser_config,
+            "from_page": 0,
+            "to_page": 100000,
+            "tenant_id": kb.tenant_id,
+            "lang": kb.language
+        }
+        threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))
+
+    for (docinfo, _), th in zip(files, threads):
+        docs = []
+        doc = {
+            "doc_id": docinfo["id"],
+            "kb_id": [kb.id]
+        }
+        for ck in th.result():
+            d = deepcopy(doc)
+            d.update(ck)
+            md5 = hashlib.md5()
+            md5.update((ck["content_with_weight"] +
+                        str(d["doc_id"])).encode("utf-8"))
+            d["_id"] = md5.hexdigest()
+            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
+            d["create_timestamp_flt"] = datetime.now().timestamp()
+            if not d.get("image"):
+                docs.append(d)
+                continue
+
+            output_buffer = BytesIO()
+            if isinstance(d["image"], bytes):
+                output_buffer = BytesIO(d["image"])
+            else:
+                d["image"].save(output_buffer, format='JPEG')
+
+            MINIO.put(kb.id, d["_id"], output_buffer.getvalue())
+            d["img_id"] = "{}-{}".format(kb.id, d["_id"])
+            del d["image"]
+            docs.append(d)
+
+    parser_ids = {d["id"]: d["parser_id"] for d, _ in files}
+    docids = [d["id"] for d, _ in files]
+    chunk_counts = {id: 0 for id in docids}
+    token_counts = {id: 0 for id in docids}
+    es_bulk_size = 64
+
+    def embedding(doc_id, cnts, batch_size=16):
+        nonlocal embd_mdl, chunk_counts, token_counts
+        vects = []
+        for i in range(0, len(cnts), batch_size):
+            vts, c = embd_mdl.encode(cnts[i: i + batch_size])
+            vects.extend(vts.tolist())
+            chunk_counts[doc_id] += len(cnts[i:i + batch_size])
+            token_counts[doc_id] += c
+        return vects
+
+    _, tenant = TenantService.get_by_id(kb.tenant_id)
+    llm_bdl = LLMBundle(kb.tenant_id, LLMType.CHAT, tenant.llm_id)
+    for doc_id in docids:
+        cks = [c for c in docs if c["doc_id"] == doc_id]
+
+        if parser_ids[doc_id] != ParserType.PICTURE.value:
+            mindmap = MindMapExtractor(llm_bdl)
+            try:
+                mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output,
+                                      ensure_ascii=False, indent=2)
+                if len(mind_map) < 32: raise Exception("Few content: " + mind_map)
+                cks.append({
+                    "id": get_uuid(),
+                    "doc_id": doc_id,
+                    "kb_id": [kb.id],
+                    "content_with_weight": mind_map,
+                    "knowledge_graph_kwd": "mind_map"
+                })
+            except Exception as e:
+                stat_logger.error("Mind map generation error:", traceback.format_exc())
+
+        vects = embedding(doc_id, [c["content_with_weight"] for c in cks])
+        assert len(cks) == len(vects)
+        for i, d in enumerate(cks):
+            v = vects[i]
+            d["q_%d_vec" % len(v)] = v
+        for b in range(0, len(cks), es_bulk_size):
+            ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], idxnm)
+
+        DocumentService.increment_chunk_num(
+            doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
+
+    return [d["id"] for d, _ in files]
```
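Two details of `doc_upload_and_parse` deserve a note. The chunk `_id` is an md5 over the chunk text plus the document id, so re-parsing identical content yields a stable Elasticsearch id, and the nested `embedding` helper batches `encode` calls while tallying per-document chunk and token counts for `increment_chunk_num`. A self-contained sketch of both, with a fake embedder standing in for the `LLMBundle` model and plain lists in place of the numpy arrays the real `encode` returns:

```python
# Standalone sketch; FakeEmbedder and the 8-dim vectors are stand-ins.
import hashlib

def chunk_id(content_with_weight: str, doc_id: str) -> str:
    # Same derivation as in the diff: md5(chunk text + doc id).
    md5 = hashlib.md5()
    md5.update((content_with_weight + str(doc_id)).encode("utf-8"))
    return md5.hexdigest()

class FakeEmbedder:
    """Stand-in for embd_mdl: encode() returns (vectors, token_count)."""
    def encode(self, texts):
        return [[0.0] * 8 for _ in texts], sum(len(t.split()) for t in texts)

def embedding(embd_mdl, cnts, batch_size=16):
    vects, chunk_count, token_count = [], 0, 0
    for i in range(0, len(cnts), batch_size):
        vts, c = embd_mdl.encode(cnts[i:i + batch_size])
        vects.extend(vts)                        # one vector per chunk
        chunk_count += len(cnts[i:i + batch_size])
        token_count += c                         # provider-reported usage
    return vects, chunk_count, token_count

texts = ["alpha beta", "gamma"]
vs, n_chunks, n_tokens = embedding(FakeEmbedder(), texts, batch_size=1)
assert len(vs) == n_chunks == 2 and n_tokens == 3
print(chunk_id(texts[0], "doc-1")[:12], n_chunks, n_tokens)
```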
api/db/services/file_service.py CHANGED

```diff
@@ -327,11 +327,11 @@ class FileService(CommonService):
 
     @classmethod
     @DB.connection_context()
-    def upload_document(self, kb, file_objs):
-        root_folder = self.get_root_folder(current_user.id)
+    def upload_document(self, kb, file_objs, user_id):
+        root_folder = self.get_root_folder(user_id)
         pf_id = root_folder["id"]
-        self.init_knowledgebase_docs(pf_id, current_user.id)
-        kb_root_folder = self.get_kb_folder(current_user.id)
+        self.init_knowledgebase_docs(pf_id, user_id)
+        kb_root_folder = self.get_kb_folder(user_id)
         kb_folder = self.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
 
         err, files = [], []
@@ -359,7 +359,7 @@ class FileService(CommonService):
                 "kb_id": kb.id,
                 "parser_id": kb.parser_id,
                 "parser_config": kb.parser_config,
-                "created_by": current_user.id,
+                "created_by": user_id,
                 "type": filetype,
                 "name": filename,
                 "location": location,
```
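The new `user_id` parameter is the point of this change: `upload_document` previously read Flask's request-scoped `current_user`, which does not exist when the call arrives through the token API (or any other non-web caller), so the acting user is now injected explicitly. A minimal, self-contained illustration of the pattern, with all names hypothetical:

```python
# Hypothetical sketch of the dependency-injection pattern used here.
from dataclasses import dataclass

@dataclass
class FileRecord:
    name: str
    created_by: str  # mirrors the "created_by" field set in the diff above

def upload_document(file_names, user_id):
    # No hidden dependency on a web session: equally callable from an
    # HTTP view (current_user.id), a token API (the tenant id recovered
    # from the APIToken), or a background job.
    return [FileRecord(name, created_by=user_id) for name in file_names]

print(upload_document(["a.pdf", "b.docx"], user_id="tenant-42"))
```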