| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import pathlib | 
					
					
						
						| 
							 | 
						import re | 
					
					
						
						| 
							 | 
						import warnings | 
					
					
						
						| 
							 | 
						from functools import partial | 
					
					
						
						| 
							 | 
						from io import BytesIO | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						from elasticsearch_dsl import Q | 
					
					
						
						| 
							 | 
						from flask import request, send_file | 
					
					
						
						| 
							 | 
						from flask_login import login_required, current_user | 
					
					
						
						| 
							 | 
						from httpx import HTTPError | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						from api.contants import NAME_LENGTH_LIMIT | 
					
					
						
						| 
							 | 
						from api.db import FileType, ParserType, FileSource, TaskStatus | 
					
					
						
						| 
							 | 
						from api.db import StatusEnum | 
					
					
						
						| 
							 | 
						from api.db.db_models import File | 
					
					
						
						| 
							 | 
						from api.db.services import duplicate_name | 
					
					
						
						| 
							 | 
						from api.db.services.document_service import DocumentService | 
					
					
						
						| 
							 | 
						from api.db.services.file2document_service import File2DocumentService | 
					
					
						
						| 
							 | 
						from api.db.services.file_service import FileService | 
					
					
						
						| 
							 | 
						from api.db.services.knowledgebase_service import KnowledgebaseService | 
					
					
						
						| 
							 | 
						from api.db.services.user_service import TenantService | 
					
					
						
						| 
							 | 
						from api.settings import RetCode | 
					
					
						
						| 
							 | 
						from api.utils import get_uuid | 
					
					
						
						| 
							 | 
						from api.utils.api_utils import construct_json_result, construct_error_response | 
					
					
						
						| 
							 | 
						from api.utils.api_utils import construct_result, validate_request | 
					
					
						
						| 
							 | 
						from api.utils.file_utils import filename_type, thumbnail | 
					
					
						
						| 
							 | 
						from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email | 
					
					
						
						| 
							 | 
						from rag.nlp import search | 
					
					
						
						| 
							 | 
						from rag.utils.es_conn import ELASTICSEARCH | 
					
					
						
						| 
							 | 
						from rag.utils.storage_factory import STORAGE_IMPL | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						MAXIMUM_OF_UPLOADING_FILES = 256 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/", methods=["POST"]) | 
					
					
						
						| 
							 | 
						@login_required   | 
					
					
						
						| 
							 | 
						@validate_request("name")   | 
					
					
						
						| 
							 | 
						def create_dataset(): | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    authorization_token = request.headers.get("Authorization") | 
					
					
						
						| 
							 | 
						    if not authorization_token: | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.AUTHENTICATION_ERROR, message="Authorization header is missing.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    tenant_id = current_user.id | 
					
					
						
						| 
							 | 
						    request_body = request.json | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    if "name" not in request_body: | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.DATA_ERROR, message="Expected 'name' field in request body") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    dataset_name = request_body["name"] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    if not dataset_name: | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.DATA_ERROR, message="Empty dataset name") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    dataset_name = dataset_name.strip() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    dataset_name_length = len(dataset_name) | 
					
					
						
						| 
							 | 
						    if dataset_name_length > NAME_LENGTH_LIMIT: | 
					
					
						
						| 
							 | 
						        return construct_json_result( | 
					
					
						
						| 
							 | 
						            code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						            message=f"Dataset name: {dataset_name} with length {dataset_name_length} exceeds {NAME_LENGTH_LIMIT}!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    if len(request_body.keys()) > 1: | 
					
					
						
						| 
							 | 
						        name_list = [] | 
					
					
						
						| 
							 | 
						        for key_name in request_body.keys(): | 
					
					
						
						| 
							 | 
						            if key_name != "name": | 
					
					
						
						| 
							 | 
						                name_list.append(key_name) | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                     message=f"fields: {name_list}, are not allowed in request body.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    request_body["name"] = duplicate_name( | 
					
					
						
						| 
							 | 
						        KnowledgebaseService.query, | 
					
					
						
						| 
							 | 
						        name=dataset_name, | 
					
					
						
						| 
							 | 
						        tenant_id=tenant_id, | 
					
					
						
						| 
							 | 
						        status=StatusEnum.VALID.value) | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        request_body["id"] = get_uuid() | 
					
					
						
						| 
							 | 
						        request_body["tenant_id"] = tenant_id | 
					
					
						
						| 
							 | 
						        request_body["created_by"] = tenant_id | 
					
					
						
						| 
							 | 
						        exist, t = TenantService.get_by_id(tenant_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_result(code=RetCode.AUTHENTICATION_ERROR, message="Tenant not found.") | 
					
					
						
						| 
							 | 
						        request_body["embd_id"] = t.embd_id | 
					
					
						
						| 
							 | 
						        if not KnowledgebaseService.save(**request_body): | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            return construct_result() | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.SUCCESS, | 
					
					
						
						| 
							 | 
						                                     data={"dataset_name": request_body["name"], "dataset_id": request_body["id"]}) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/", methods=["GET"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def list_datasets(): | 
					
					
						
						| 
							 | 
						    offset = request.args.get("offset", 0) | 
					
					
						
						| 
							 | 
						    count = request.args.get("count", -1) | 
					
					
						
						| 
							 | 
						    orderby = request.args.get("orderby", "create_time") | 
					
					
						
						| 
							 | 
						    desc = request.args.get("desc", True) | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        tenants = TenantService.get_joined_tenants_by_user_id(current_user.id) | 
					
					
						
						| 
							 | 
						        datasets = KnowledgebaseService.get_by_tenant_ids_by_offset( | 
					
					
						
						| 
							 | 
						            [m["tenant_id"] for m in tenants], current_user.id, int(offset), int(count), orderby, desc) | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=datasets, code=RetCode.SUCCESS, message=f"List datasets successfully!") | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						    except HTTPError as http_err: | 
					
					
						
						| 
							 | 
						        return construct_json_result(http_err) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>", methods=["DELETE"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def remove_dataset(dataset_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        datasets = KnowledgebaseService.query(created_by=current_user.id, id=dataset_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not datasets: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"The dataset cannot be found for your current account.", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.OPERATING_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        for doc in DocumentService.query(kb_id=dataset_id): | 
					
					
						
						| 
							 | 
						            if not DocumentService.remove_document(doc, datasets[0].tenant_id): | 
					
					
						
						| 
							 | 
						                 | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                             message="There was an error during the document removal process. " | 
					
					
						
						| 
							 | 
						                                                     "Please check the status of the RAGFlow server and try the removal again.") | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            f2d = File2DocumentService.get_by_document_id(doc.id) | 
					
					
						
						| 
							 | 
						            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) | 
					
					
						
						| 
							 | 
						            File2DocumentService.delete_by_document_id(doc.id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not KnowledgebaseService.delete_by_id(dataset_id): | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message="There was an error during the dataset removal process. " | 
					
					
						
						| 
							 | 
						                                                 "Please check the status of the RAGFlow server and try the removal again.") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully") | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>", methods=["GET"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def get_dataset(dataset_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        dataset = KnowledgebaseService.get_detail(dataset_id) | 
					
					
						
						| 
							 | 
						        if not dataset: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, message="Can't find this dataset!") | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=dataset, code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_json_result(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>", methods=["PUT"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def update_dataset(dataset_id): | 
					
					
						
						| 
							 | 
						    req = request.json | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not req: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, message="Please input at least one parameter that " | 
					
					
						
						| 
							 | 
						                                                                          "you want to update!") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not KnowledgebaseService.query(created_by=current_user.id, id=dataset_id): | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"Only the owner of knowledgebase is authorized for this operation!", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.OPERATING_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        exist, dataset = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, message="This dataset cannot be found!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if "name" in req: | 
					
					
						
						| 
							 | 
						            name = req["name"].strip() | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if name.lower() != dataset.name.lower() \ | 
					
					
						
						| 
							 | 
						                    and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id, | 
					
					
						
						| 
							 | 
						                                                       status=StatusEnum.VALID.value)) > 1: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                             message=f"The name: {name.lower()} is already used by other " | 
					
					
						
						| 
							 | 
						                                                     f"datasets. Please choose a different name.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        dataset_updating_data = {} | 
					
					
						
						| 
							 | 
						        chunk_num = req.get("chunk_num") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if req.get("embedding_model_id"): | 
					
					
						
						| 
							 | 
						            if chunk_num == 0: | 
					
					
						
						| 
							 | 
						                dataset_updating_data["embd_id"] = req["embedding_model_id"] | 
					
					
						
						| 
							 | 
						            else: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                             message="You have already parsed the document in this " | 
					
					
						
						| 
							 | 
						                                                     "dataset, so you cannot change the embedding " | 
					
					
						
						| 
							 | 
						                                                     "model.") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if "chunk_method" in req: | 
					
					
						
						| 
							 | 
						            type_value = req["chunk_method"] | 
					
					
						
						| 
							 | 
						            if is_illegal_value_for_enum(type_value, ParserType): | 
					
					
						
						| 
							 | 
						                return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.", | 
					
					
						
						| 
							 | 
						                                             code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						            if chunk_num != 0: | 
					
					
						
						| 
							 | 
						                construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document " | 
					
					
						
						| 
							 | 
						                                                                       "in this dataset, so you cannot " | 
					
					
						
						| 
							 | 
						                                                                       "change the chunk method.") | 
					
					
						
						| 
							 | 
						            dataset_updating_data["parser_id"] = req["template_type"] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if req.get("photo"): | 
					
					
						
						| 
							 | 
						            dataset_updating_data["avatar"] = req["photo"] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if "layout_recognize" in req: | 
					
					
						
						| 
							 | 
						            if "parser_config" not in dataset_updating_data: | 
					
					
						
						| 
							 | 
						                dataset_updating_data['parser_config'] = {} | 
					
					
						
						| 
							 | 
						            dataset_updating_data['parser_config']['layout_recognize'] = req['layout_recognize'] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        for key in ["name", "language", "description", "permission", "id", "token_num"]: | 
					
					
						
						| 
							 | 
						            if key in req: | 
					
					
						
						| 
							 | 
						                dataset_updating_data[key] = req.get(key) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not KnowledgebaseService.update_by_id(dataset.id, dataset_updating_data): | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.OPERATING_ERROR, message="Failed to update! " | 
					
					
						
						| 
							 | 
						                                                                               "Please check the status of RAGFlow " | 
					
					
						
						| 
							 | 
						                                                                               "server and try again!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        exist, dataset = KnowledgebaseService.get_by_id(dataset.id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, message="Failed to get the dataset " | 
					
					
						
						| 
							 | 
						                                                                          "using the dataset ID.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/", methods=["POST"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def upload_documents(dataset_id): | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    if not request.files: | 
					
					
						
						| 
							 | 
						        return construct_json_result( | 
					
					
						
						| 
							 | 
						            message="There is no file!", code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    file_objs = request.files.getlist("file") | 
					
					
						
						| 
							 | 
						    num_file_objs = len(file_objs) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES: | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, " | 
					
					
						
						| 
							 | 
						                                                                      f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    exist, dataset = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						    if not exist: | 
					
					
						
						| 
							 | 
						        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    for file_obj in file_objs: | 
					
					
						
						| 
							 | 
						        file_name = file_obj.filename | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not file_name: | 
					
					
						
						| 
							 | 
						            return construct_json_result( | 
					
					
						
						| 
							 | 
						                message="There is a file without name!", code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if 'http' in file_name: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    root_folder = FileService.get_root_folder(current_user.id) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    parent_file_id = root_folder["id"]   | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    FileService.init_knowledgebase_docs(parent_file_id, current_user.id) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    kb_root_folder = FileService.get_kb_folder(current_user.id) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    err = [] | 
					
					
						
						| 
							 | 
						    MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0)) | 
					
					
						
						| 
							 | 
						    uploaded_docs_json = [] | 
					
					
						
						| 
							 | 
						    for file in file_objs: | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                             message="Exceed the maximum file number of a free user!") | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            filename = duplicate_name( | 
					
					
						
						| 
							 | 
						                DocumentService.query, | 
					
					
						
						| 
							 | 
						                name=file.filename, | 
					
					
						
						| 
							 | 
						                kb_id=dataset.id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            filetype = filename_type(filename) | 
					
					
						
						| 
							 | 
						            if filetype == FileType.OTHER.value: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                             message="This type of file has not been supported yet!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            location = filename | 
					
					
						
						| 
							 | 
						            while STORAGE_IMPL.obj_exist(dataset_id, location): | 
					
					
						
						| 
							 | 
						                location += "_" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            blob = file.read() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if blob == b'': | 
					
					
						
						| 
							 | 
						                warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            STORAGE_IMPL.put(dataset_id, location, blob) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            doc = { | 
					
					
						
						| 
							 | 
						                "id": get_uuid(), | 
					
					
						
						| 
							 | 
						                "kb_id": dataset.id, | 
					
					
						
						| 
							 | 
						                "parser_id": dataset.parser_id, | 
					
					
						
						| 
							 | 
						                "parser_config": dataset.parser_config, | 
					
					
						
						| 
							 | 
						                "created_by": current_user.id, | 
					
					
						
						| 
							 | 
						                "type": filetype, | 
					
					
						
						| 
							 | 
						                "name": filename, | 
					
					
						
						| 
							 | 
						                "location": location, | 
					
					
						
						| 
							 | 
						                "size": len(blob), | 
					
					
						
						| 
							 | 
						                "thumbnail": thumbnail(filename, blob) | 
					
					
						
						| 
							 | 
						            } | 
					
					
						
						| 
							 | 
						            if doc["type"] == FileType.VISUAL: | 
					
					
						
						| 
							 | 
						                doc["parser_id"] = ParserType.PICTURE.value | 
					
					
						
						| 
							 | 
						            if doc["type"] == FileType.AURAL: | 
					
					
						
						| 
							 | 
						                doc["parser_id"] = ParserType.AUDIO.value | 
					
					
						
						| 
							 | 
						            if re.search(r"\.(ppt|pptx|pages)$", filename): | 
					
					
						
						| 
							 | 
						                doc["parser_id"] = ParserType.PRESENTATION.value | 
					
					
						
						| 
							 | 
						            if re.search(r"\.(eml)$", filename): | 
					
					
						
						| 
							 | 
						                doc["parser_id"] = ParserType.EMAIL.value | 
					
					
						
						| 
							 | 
						            DocumentService.insert(doc) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id) | 
					
					
						
						| 
							 | 
						            uploaded_docs_json.append(doc) | 
					
					
						
						| 
							 | 
						        except Exception as e: | 
					
					
						
						| 
							 | 
						            err.append(file.filename + ": " + str(e)) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    if err: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/<document_id>", methods=["DELETE"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def delete_document(document_id, dataset_id):   | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    root_folder = FileService.get_root_folder(current_user.id) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    parent_file_id = root_folder["id"] | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    FileService.init_knowledgebase_docs(parent_file_id, current_user.id) | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    errors = "" | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, doc = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        tenant_id = DocumentService.get_tenant_id(document_id) | 
					
					
						
						| 
							 | 
						        if not tenant_id: | 
					
					
						
						| 
							 | 
						            return construct_json_result( | 
					
					
						
						| 
							 | 
						                message=f"You cannot delete this document {document_id} due to the authorization" | 
					
					
						
						| 
							 | 
						                        f" reason!", code=RetCode.AUTHENTICATION_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        real_dataset_id, location = File2DocumentService.get_storage_address(doc_id=document_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if real_dataset_id != dataset_id: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, " | 
					
					
						
						| 
							 | 
						                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not DocumentService.remove_document(doc, tenant_id): | 
					
					
						
						| 
							 | 
						            return construct_json_result( | 
					
					
						
						| 
							 | 
						                message="There was an error during the document removal process. Please check the status of the " | 
					
					
						
						| 
							 | 
						                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        file_to_doc = File2DocumentService.get_by_document_id(document_id) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id]) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        File2DocumentService.delete_by_document_id(document_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        STORAGE_IMPL.rm(dataset_id, location) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        errors += str(e) | 
					
					
						
						| 
							 | 
						    if errors: | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    return construct_json_result(data=True, code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route('/<dataset_id>/documents/', methods=['GET']) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def list_documents(dataset_id): | 
					
					
						
						| 
							 | 
						    if not dataset_id: | 
					
					
						
						| 
							 | 
						        return construct_json_result( | 
					
					
						
						| 
							 | 
						            data=False, message="Lack of 'dataset_id'", code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    keywords = request.args.get("keywords", "") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    offset = request.args.get("offset", 0) | 
					
					
						
						| 
							 | 
						    count = request.args.get("count", -1) | 
					
					
						
						| 
							 | 
						    order_by = request.args.get("order_by", "create_time") | 
					
					
						
						| 
							 | 
						    descend = request.args.get("descend", True) | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by, | 
					
					
						
						| 
							 | 
						                                                                descend, keywords) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return construct_json_result(data={"total": total, "docs": docs}, message=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/<document_id>", methods=["PUT"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def update_document(dataset_id, document_id): | 
					
					
						
						| 
							 | 
						    req = request.json | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        legal_parameters = set() | 
					
					
						
						| 
							 | 
						        legal_parameters.add("name") | 
					
					
						
						| 
							 | 
						        legal_parameters.add("enable") | 
					
					
						
						| 
							 | 
						        legal_parameters.add("template_type") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        for key in req.keys(): | 
					
					
						
						| 
							 | 
						            if key not in legal_parameters: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.ARGUMENT_ERROR, message=f"{key} is an illegal parameter.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not req: | 
					
					
						
						| 
							 | 
						            return construct_json_result( | 
					
					
						
						| 
							 | 
						                code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                message="Please input at least one parameter that you want to update!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, dataset = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, document = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"This document {document_id} cannot be found!", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        updating_data = {} | 
					
					
						
						| 
							 | 
						        if "name" in req: | 
					
					
						
						| 
							 | 
						            new_name = req["name"] | 
					
					
						
						| 
							 | 
						            updating_data["name"] = new_name | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if not new_name: | 
					
					
						
						| 
							 | 
						                return construct_json_result(code=RetCode.DATA_ERROR, message="There is no new name.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            new_name = new_name.strip() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if pathlib.Path(new_name.lower()).suffix != pathlib.Path( | 
					
					
						
						| 
							 | 
						                    document.name.lower()).suffix: | 
					
					
						
						| 
							 | 
						                return construct_json_result( | 
					
					
						
						| 
							 | 
						                    data=False, | 
					
					
						
						| 
							 | 
						                    message="The extension of file cannot be changed", | 
					
					
						
						| 
							 | 
						                    code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            for d in DocumentService.query(name=new_name, kb_id=document.kb_id): | 
					
					
						
						| 
							 | 
						                if d.name == new_name: | 
					
					
						
						| 
							 | 
						                    return construct_json_result( | 
					
					
						
						| 
							 | 
						                        message="Duplicated document name in the same dataset.", | 
					
					
						
						| 
							 | 
						                        code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if "enable" in req: | 
					
					
						
						| 
							 | 
						            enable_value = req["enable"] | 
					
					
						
						| 
							 | 
						            if is_illegal_value_for_enum(enable_value, StatusEnum): | 
					
					
						
						| 
							 | 
						                return construct_json_result(message=f"Illegal value {enable_value} for 'enable' field.", | 
					
					
						
						| 
							 | 
						                                             code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						            updating_data["status"] = enable_value | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if "template_type" in req: | 
					
					
						
						| 
							 | 
						            type_value = req["template_type"] | 
					
					
						
						| 
							 | 
						            if is_illegal_value_for_enum(type_value, ParserType): | 
					
					
						
						| 
							 | 
						                return construct_json_result(message=f"Illegal value {type_value} for 'template_type' field.", | 
					
					
						
						| 
							 | 
						                                             code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						            updating_data["parser_id"] = req["template_type"] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not DocumentService.update_by_id(document_id, updating_data): | 
					
					
						
						| 
							 | 
						            return construct_json_result( | 
					
					
						
						| 
							 | 
						                code=RetCode.OPERATING_ERROR, | 
					
					
						
						| 
							 | 
						                message="Failed to update document in the database! " | 
					
					
						
						| 
							 | 
						                        "Please check the status of RAGFlow server and try again!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if "name" in req: | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            file_information = File2DocumentService.get_by_document_id(document_id) | 
					
					
						
						| 
							 | 
						            if file_information: | 
					
					
						
						| 
							 | 
						                exist, file = FileService.get_by_id(file_information[0].file_id) | 
					
					
						
						| 
							 | 
						                FileService.update_by_id(file.id, {"name": req["name"]}) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        exist, document = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=document.to_json(), message="Success", code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def is_illegal_value_for_enum(value, enum_class): | 
					
					
						
						| 
							 | 
						    return value not in enum_class.__members__.values() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def download_document(dataset_id, document_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, document = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"This document '{document_id}' cannot be found!", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id)   | 
					
					
						
						| 
							 | 
						        file_stream = STORAGE_IMPL.get(doc_id, doc_location) | 
					
					
						
						| 
							 | 
						        if not file_stream: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        file = BytesIO(file_stream) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        return send_file( | 
					
					
						
						| 
							 | 
						            file, | 
					
					
						
						| 
							 | 
						            as_attachment=True, | 
					
					
						
						| 
							 | 
						            download_name=document.name, | 
					
					
						
						| 
							 | 
						            mimetype='application/octet-stream'   | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def doc_parse_callback(doc_id, prog=None, msg=""): | 
					
					
						
						| 
							 | 
						    cancel = DocumentService.do_cancel(doc_id) | 
					
					
						
						| 
							 | 
						    if cancel: | 
					
					
						
						| 
							 | 
						        raise Exception("The parsing process has been cancelled!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						""" | 
					
					
						
						| 
							 | 
						def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id): | 
					
					
						
						| 
							 | 
						    match parser_name: | 
					
					
						
						| 
							 | 
						        case "book": | 
					
					
						
						| 
							 | 
						            book.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "laws": | 
					
					
						
						| 
							 | 
						            laws.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "manual": | 
					
					
						
						| 
							 | 
						            manual.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "naive": | 
					
					
						
						| 
							 | 
						            # It's the mode by default, which is general in the front-end | 
					
					
						
						| 
							 | 
						            naive.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "one": | 
					
					
						
						| 
							 | 
						            one.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "paper": | 
					
					
						
						| 
							 | 
						            paper.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "picture": | 
					
					
						
						| 
							 | 
						            picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", | 
					
					
						
						| 
							 | 
						                          callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "presentation": | 
					
					
						
						| 
							 | 
						            presentation.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "qa": | 
					
					
						
						| 
							 | 
						            qa.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "resume": | 
					
					
						
						| 
							 | 
						            resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "table": | 
					
					
						
						| 
							 | 
						            table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "audio": | 
					
					
						
						| 
							 | 
						            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case "email": | 
					
					
						
						| 
							 | 
						            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) | 
					
					
						
						| 
							 | 
						        case _: | 
					
					
						
						| 
							 | 
						            return False | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						    return True | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/<document_id>/status", methods=["POST"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def parse_document(dataset_id, document_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return parsing_document_internal(document_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/status", methods=["POST"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def parse_documents(dataset_id): | 
					
					
						
						| 
							 | 
						    doc_ids = request.json["doc_ids"] | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if not doc_ids: | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", | 
					
					
						
						| 
							 | 
						                                                                    True, "") | 
					
					
						
						| 
							 | 
						            doc_ids = [doc["id"] for doc in docs] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        message = "" | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        for id in doc_ids: | 
					
					
						
						| 
							 | 
						            res = parsing_document_internal(id) | 
					
					
						
						| 
							 | 
						            res_body = res.json | 
					
					
						
						| 
							 | 
						            if res_body["code"] == RetCode.SUCCESS: | 
					
					
						
						| 
							 | 
						                message += res_body["message"] | 
					
					
						
						| 
							 | 
						            else: | 
					
					
						
						| 
							 | 
						                return res | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def parsing_document_internal(id): | 
					
					
						
						| 
							 | 
						    message = "" | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, document = DocumentService.get_by_id(id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"This document '{id}' cannot be found!", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        tenant_id = DocumentService.get_tenant_id(id) | 
					
					
						
						| 
							 | 
						        if not tenant_id: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        info = {"run": "1", "progress": 0} | 
					
					
						
						| 
							 | 
						        info["progress_msg"] = "" | 
					
					
						
						| 
							 | 
						        info["chunk_num"] = 0 | 
					
					
						
						| 
							 | 
						        info["token_num"] = 0 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        DocumentService.update_by_id(id, info) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        _, doc_attributes = DocumentService.get_by_id(id) | 
					
					
						
						| 
							 | 
						        doc_attributes = doc_attributes.to_dict() | 
					
					
						
						| 
							 | 
						        doc_id = doc_attributes["id"] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        bucket, doc_name = File2DocumentService.get_storage_address(doc_id=doc_id) | 
					
					
						
						| 
							 | 
						        binary = STORAGE_IMPL.get(bucket, doc_name) | 
					
					
						
						| 
							 | 
						        parser_name = doc_attributes["parser_id"] | 
					
					
						
						| 
							 | 
						        if binary: | 
					
					
						
						| 
							 | 
						            res = doc_parse(binary, doc_name, parser_name, tenant_id, doc_id) | 
					
					
						
						| 
							 | 
						            if res is False: | 
					
					
						
						| 
							 | 
						                message += f"The parser id: {parser_name} of the document {doc_id} is not supported; " | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            message += f"Empty data in the document: {doc_name}; " | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if doc_attributes["status"] == TaskStatus.FAIL.value: | 
					
					
						
						| 
							 | 
						            message += f"Failed in parsing the document: {doc_id}; " | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.SUCCESS, message=message) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("<dataset_id>/documents/<document_id>/status", methods=["DELETE"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def stop_parsing_document(dataset_id, document_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return stop_parsing_document_internal(document_id) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("<dataset_id>/documents/status", methods=["DELETE"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def stop_parsing_documents(dataset_id): | 
					
					
						
						| 
							 | 
						    doc_ids = request.json["doc_ids"] | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						        if not doc_ids: | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", | 
					
					
						
						| 
							 | 
						                                                                        True, "") | 
					
					
						
						| 
							 | 
						            doc_ids = [doc["id"] for doc in docs] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        message = "" | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        for id in doc_ids: | 
					
					
						
						| 
							 | 
						            res = stop_parsing_document_internal(id) | 
					
					
						
						| 
							 | 
						            res_body = res.json | 
					
					
						
						| 
							 | 
						            if res_body["code"] == RetCode.SUCCESS: | 
					
					
						
						| 
							 | 
						                message += res_body["message"] | 
					
					
						
						| 
							 | 
						            else: | 
					
					
						
						| 
							 | 
						                return res | 
					
					
						
						| 
							 | 
						        return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						def stop_parsing_document_internal(document_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, doc = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(message=f"This document '{document_id}' cannot be found!", | 
					
					
						
						| 
							 | 
						                                         code=RetCode.ARGUMENT_ERROR) | 
					
					
						
						| 
							 | 
						        doc_attributes = doc.to_dict() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if doc_attributes["status"] == TaskStatus.RUNNING.value: | 
					
					
						
						| 
							 | 
						            tenant_id = DocumentService.get_tenant_id(document_id) | 
					
					
						
						| 
							 | 
						            if not tenant_id: | 
					
					
						
						| 
							 | 
						                return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if not DocumentService.update_by_id(document_id, {"status": "2"}):   | 
					
					
						
						| 
							 | 
						                return construct_json_result( | 
					
					
						
						| 
							 | 
						                    code=RetCode.OPERATING_ERROR, | 
					
					
						
						| 
							 | 
						                    message="There was an error during the stopping parsing the document process. " | 
					
					
						
						| 
							 | 
						                            "Please check the status of the RAGFlow server and try the update again." | 
					
					
						
						| 
							 | 
						                ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            _, doc_attributes = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						            doc_attributes = doc_attributes.to_dict() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if doc_attributes["status"] == TaskStatus.RUNNING.value: | 
					
					
						
						| 
							 | 
						                return construct_json_result(message=f"Failed in parsing the document: {document_id}; ", code=RetCode.SUCCESS) | 
					
					
						
						| 
							 | 
						        return construct_json_result(code=RetCode.SUCCESS, message="") | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						@manager.route("/<dataset_id>/documents/<document_id>/status", methods=["GET"]) | 
					
					
						
						| 
							 | 
						@login_required | 
					
					
						
						| 
							 | 
						def show_parsing_status(dataset_id, document_id): | 
					
					
						
						| 
							 | 
						    try: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = KnowledgebaseService.get_by_id(dataset_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This dataset: '{dataset_id}' cannot be found!") | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        exist, _ = DocumentService.get_by_id(document_id) | 
					
					
						
						| 
							 | 
						        if not exist: | 
					
					
						
						| 
							 | 
						            return construct_json_result(code=RetCode.DATA_ERROR, | 
					
					
						
						| 
							 | 
						                                         message=f"This document: '{document_id}' is not a valid document.") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        _, doc = DocumentService.get_by_id(document_id)   | 
					
					
						
						| 
							 | 
						        doc_attributes = doc.to_dict() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return construct_json_result( | 
					
					
						
						| 
							 | 
						            data={"progress": doc_attributes["progress"], "status": TaskStatus(doc_attributes["status"]).name}, | 
					
					
						
						| 
							 | 
						            code=RetCode.SUCCESS | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						    except Exception as e: | 
					
					
						
						| 
							 | 
						        return construct_error_response(e) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 |