Matej Horník
committed on
Commit
·
19e6d59
1
Parent(s):
3090a99
feat: docs for api endpoints to generate openapi specification (#3109)
### What problem does this PR solve?

**Added an OpenAPI specification for the API routes. This creates a Swagger UI,
similar to FastAPI's, to make the API easier to use.**

Implemented with the Python package `flasgger`.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
Not all routes are included yet, since this is a work in progress.
The docs can be accessed at: `{host}:{port}/apidocs`
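For reference, here is a minimal sketch (not part of this diff) of the pattern the PR uses: `flasgger` reads a YAML block from each view function's docstring and renders it in the Swagger UI. The `/ping` route and its fields below are purely illustrative.

```python
# Minimal sketch, not from this PR: flasgger turns a YAML docstring into a
# Swagger UI entry. The /ping endpoint and its description are illustrative.
from flask import Flask, jsonify
from flasgger import Swagger

app = Flask(__name__)
swagger = Swagger(app)  # serves the UI at /apidocs by default


@app.route("/ping")
def ping():
    """
    Health check.
    ---
    tags:
      - System
    responses:
      200:
        description: Service is reachable.
    """
    return jsonify({"status": "ok"})


if __name__ == "__main__":
    app.run()
```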
- api/apps/__init__.py +63 -19
- api/apps/sdk/dataset.py +379 -76
- api/apps/sdk/doc.py +891 -151
- api/apps/system_app.py +188 -26
- api/apps/user_app.py +400 -108
- api/ragflow_server.py +28 -11
- poetry.lock +100 -6
- pyproject.toml +2 -1
api/apps/__init__.py
CHANGED
|
@@ -21,6 +21,7 @@ from pathlib import Path
|
|
| 21 |
from flask import Blueprint, Flask
|
| 22 |
from werkzeug.wrappers.request import Request
|
| 23 |
from flask_cors import CORS
|
|
|
|
| 24 |
|
| 25 |
from api.db import StatusEnum
|
| 26 |
from api.db.db_models import close_connection
|
|
@@ -34,27 +35,62 @@ from api.settings import API_VERSION, access_logger
|
|
| 34 |
from api.utils.api_utils import server_error_response
|
| 35 |
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
|
| 36 |
|
| 37 |
-
__all__ = [
|
| 38 |
|
| 39 |
|
| 40 |
-
logger = logging.getLogger(
|
| 41 |
for h in access_logger.handlers:
|
| 42 |
logger.addHandler(h)
|
| 43 |
|
| 44 |
Request.json = property(lambda self: self.get_json(force=True, silent=True))
|
| 45 |
|
| 46 |
app = Flask(__name__)
|
| 47 |
-
|
|
| 48 |
app.url_map.strict_slashes = False
|
| 49 |
app.json_encoder = CustomJSONEncoder
|
| 50 |
app.errorhandler(Exception)(server_error_response)
|
| 51 |
|
| 52 |
|
| 53 |
## convince for dev and debug
|
| 54 |
-
#app.config["LOGIN_DISABLED"] = True
|
| 55 |
app.config["SESSION_PERMANENT"] = False
|
| 56 |
app.config["SESSION_TYPE"] = "filesystem"
|
| 57 |
-
app.config[
|
|
|
|
|
|
|
| 58 |
|
| 59 |
Session(app)
|
| 60 |
login_manager = LoginManager()
|
|
@@ -64,17 +100,23 @@ commands.register_commands(app)
|
|
| 64 |
|
| 65 |
|
| 66 |
def search_pages_path(pages_dir):
|
| 67 |
-
app_path_list = [
|
| 68 |
-
|
|
| 69 |
app_path_list.extend(api_path_list)
|
| 70 |
return app_path_list
|
| 71 |
|
| 72 |
|
| 73 |
def register_page(page_path):
|
| 74 |
-
path = f
|
| 75 |
|
| 76 |
-
page_name = page_path.stem.rstrip(
|
| 77 |
-
module_name =
|
|
|
|
|
|
|
| 78 |
|
| 79 |
spec = spec_from_file_location(module_name, page_path)
|
| 80 |
page = module_from_spec(spec)
|
|
@@ -82,8 +124,10 @@ def register_page(page_path):
|
|
| 82 |
page.manager = Blueprint(page_name, module_name)
|
| 83 |
sys.modules[module_name] = page
|
| 84 |
spec.loader.exec_module(page)
|
| 85 |
-
page_name = getattr(page,
|
| 86 |
-
url_prefix =
|
|
|
|
|
|
|
| 87 |
|
| 88 |
app.register_blueprint(page.manager, url_prefix=url_prefix)
|
| 89 |
return url_prefix
|
|
@@ -91,14 +135,12 @@ def register_page(page_path):
|
|
| 91 |
|
| 92 |
pages_dir = [
|
| 93 |
Path(__file__).parent,
|
| 94 |
-
Path(__file__).parent.parent /
|
| 95 |
-
Path(__file__).parent.parent /
|
| 96 |
]
|
| 97 |
|
| 98 |
client_urls_prefix = [
|
| 99 |
-
register_page(path)
|
| 100 |
-
for dir in pages_dir
|
| 101 |
-
for path in search_pages_path(dir)
|
| 102 |
]
|
| 103 |
|
| 104 |
|
|
@@ -109,7 +151,9 @@ def load_user(web_request):
|
|
| 109 |
if authorization:
|
| 110 |
try:
|
| 111 |
access_token = str(jwt.loads(authorization))
|
| 112 |
-
user = UserService.query(
|
|
|
|
|
|
|
| 113 |
if user:
|
| 114 |
return user[0]
|
| 115 |
else:
|
|
@@ -123,4 +167,4 @@ def load_user(web_request):
|
|
| 123 |
|
| 124 |
@app.teardown_request
|
| 125 |
def _db_close(exc):
|
| 126 |
-
close_connection()
|
|
|
|
| 21 |
from flask import Blueprint, Flask
|
| 22 |
from werkzeug.wrappers.request import Request
|
| 23 |
from flask_cors import CORS
|
| 24 |
+
from flasgger import Swagger
|
| 25 |
|
| 26 |
from api.db import StatusEnum
|
| 27 |
from api.db.db_models import close_connection
|
|
|
|
| 35 |
from api.utils.api_utils import server_error_response
|
| 36 |
from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
|
| 37 |
|
| 38 |
+
__all__ = ["app"]
|
| 39 |
|
| 40 |
|
| 41 |
+
logger = logging.getLogger("flask.app")
|
| 42 |
for h in access_logger.handlers:
|
| 43 |
logger.addHandler(h)
|
| 44 |
|
| 45 |
Request.json = property(lambda self: self.get_json(force=True, silent=True))
|
| 46 |
|
| 47 |
app = Flask(__name__)
|
| 48 |
+
|
| 49 |
+
# Add this at the beginning of your file to configure Swagger UI
|
| 50 |
+
swagger_config = {
|
| 51 |
+
"headers": [],
|
| 52 |
+
"specs": [
|
| 53 |
+
{
|
| 54 |
+
"endpoint": "apispec",
|
| 55 |
+
"route": "/apispec.json",
|
| 56 |
+
"rule_filter": lambda rule: True, # Include all endpoints
|
| 57 |
+
"model_filter": lambda tag: True, # Include all models
|
| 58 |
+
}
|
| 59 |
+
],
|
| 60 |
+
"static_url_path": "/flasgger_static",
|
| 61 |
+
"swagger_ui": True,
|
| 62 |
+
"specs_route": "/apidocs/",
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
swagger = Swagger(
|
| 66 |
+
app,
|
| 67 |
+
config=swagger_config,
|
| 68 |
+
template={
|
| 69 |
+
"swagger": "2.0",
|
| 70 |
+
"info": {
|
| 71 |
+
"title": "RAGFlow API",
|
| 72 |
+
"description": "",
|
| 73 |
+
"version": "1.0.0",
|
| 74 |
+
},
|
| 75 |
+
"securityDefinitions": {
|
| 76 |
+
"ApiKeyAuth": {"type": "apiKey", "name": "Authorization", "in": "header"}
|
| 77 |
+
},
|
| 78 |
+
},
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
CORS(app, supports_credentials=True, max_age=2592000)
|
| 82 |
app.url_map.strict_slashes = False
|
| 83 |
app.json_encoder = CustomJSONEncoder
|
| 84 |
app.errorhandler(Exception)(server_error_response)
|
| 85 |
|
| 86 |
|
| 87 |
## convince for dev and debug
|
| 88 |
+
# app.config["LOGIN_DISABLED"] = True
|
| 89 |
app.config["SESSION_PERMANENT"] = False
|
| 90 |
app.config["SESSION_TYPE"] = "filesystem"
|
| 91 |
+
app.config["MAX_CONTENT_LENGTH"] = int(
|
| 92 |
+
os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)
|
| 93 |
+
)
|
| 94 |
|
| 95 |
Session(app)
|
| 96 |
login_manager = LoginManager()
|
|
|
|
| 100 |
|
| 101 |
|
| 102 |
def search_pages_path(pages_dir):
|
| 103 |
+
app_path_list = [
|
| 104 |
+
path for path in pages_dir.glob("*_app.py") if not path.name.startswith(".")
|
| 105 |
+
]
|
| 106 |
+
api_path_list = [
|
| 107 |
+
path for path in pages_dir.glob("*sdk/*.py") if not path.name.startswith(".")
|
| 108 |
+
]
|
| 109 |
app_path_list.extend(api_path_list)
|
| 110 |
return app_path_list
|
| 111 |
|
| 112 |
|
| 113 |
def register_page(page_path):
|
| 114 |
+
path = f"{page_path}"
|
| 115 |
|
| 116 |
+
page_name = page_path.stem.rstrip("_app")
|
| 117 |
+
module_name = ".".join(
|
| 118 |
+
page_path.parts[page_path.parts.index("api") : -1] + (page_name,)
|
| 119 |
+
)
|
| 120 |
|
| 121 |
spec = spec_from_file_location(module_name, page_path)
|
| 122 |
page = module_from_spec(spec)
|
|
|
|
| 124 |
page.manager = Blueprint(page_name, module_name)
|
| 125 |
sys.modules[module_name] = page
|
| 126 |
spec.loader.exec_module(page)
|
| 127 |
+
page_name = getattr(page, "page_name", page_name)
|
| 128 |
+
url_prefix = (
|
| 129 |
+
f"/api/{API_VERSION}" if "/sdk/" in path else f"/{API_VERSION}/{page_name}"
|
| 130 |
+
)
|
| 131 |
|
| 132 |
app.register_blueprint(page.manager, url_prefix=url_prefix)
|
| 133 |
return url_prefix
|
|
|
|
| 135 |
|
| 136 |
pages_dir = [
|
| 137 |
Path(__file__).parent,
|
| 138 |
+
Path(__file__).parent.parent / "api" / "apps",
|
| 139 |
+
Path(__file__).parent.parent / "api" / "apps" / "sdk",
|
| 140 |
]
|
| 141 |
|
| 142 |
client_urls_prefix = [
|
| 143 |
+
register_page(path) for dir in pages_dir for path in search_pages_path(dir)
|
|
|
|
|
|
|
| 144 |
]
|
| 145 |
|
| 146 |
|
|
|
|
| 151 |
if authorization:
|
| 152 |
try:
|
| 153 |
access_token = str(jwt.loads(authorization))
|
| 154 |
+
user = UserService.query(
|
| 155 |
+
access_token=access_token, status=StatusEnum.VALID.value
|
| 156 |
+
)
|
| 157 |
if user:
|
| 158 |
return user[0]
|
| 159 |
else:
|
|
|
|
| 167 |
|
| 168 |
@app.teardown_request
|
| 169 |
def _db_close(exc):
|
| 170 |
+
close_connection()
|
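With the configuration above, the generated spec is served at `/apispec.json` and the UI at `/apidocs/`. A quick sanity check, as a sketch that assumes a local deployment on RAGFlow's default HTTP port (9380); adjust the host/port for your setup:

```python
# Sketch: fetch the generated OpenAPI spec and list the documented paths.
# Assumes the server is reachable at localhost:9380; adjust as needed.
import requests

spec = requests.get("http://localhost:9380/apispec.json").json()
print(spec["info"]["title"])        # "RAGFlow API"
for path in sorted(spec["paths"]):  # every endpoint that has a docstring spec
    print(path)
```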
api/apps/sdk/dataset.py
CHANGED
|
@@ -21,16 +21,72 @@ from api.db.services.document_service import DocumentService
|
|
| 21 |
from api.db.services.file2document_service import File2DocumentService
|
| 22 |
from api.db.services.file_service import FileService
|
| 23 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
| 24 |
-
from api.db.services.llm_service import TenantLLMService,LLMService
|
| 25 |
from api.db.services.user_service import TenantService
|
| 26 |
from api.settings import RetCode
|
| 27 |
from api.utils import get_uuid
|
| 28 |
-
from api.utils.api_utils import
|
|
| 29 |
|
| 30 |
|
| 31 |
-
@manager.route(
|
| 32 |
@token_required
|
| 33 |
def create(tenant_id):
|
|
|
| 34 |
req = request.json
|
| 35 |
e, t = TenantService.get_by_id(tenant_id)
|
| 36 |
permission = req.get("permission")
|
|
@@ -38,49 +94,97 @@ def create(tenant_id):
|
|
| 38 |
chunk_method = req.get("chunk_method")
|
| 39 |
parser_config = req.get("parser_config")
|
| 40 |
valid_permission = ["me", "team"]
|
| 41 |
-
valid_language =["Chinese", "English"]
|
| 42 |
-
valid_chunk_method = [
|
| 43 |
-
|
|
| 44 |
if check_validation:
|
| 45 |
return check_validation
|
| 46 |
-
req["parser_config"]=get_parser_config(chunk_method,parser_config)
|
| 47 |
if "tenant_id" in req:
|
| 48 |
-
return get_error_data_result(
|
| 49 |
-
retmsg="`tenant_id` must not be provided")
|
| 50 |
if "chunk_count" in req or "document_count" in req:
|
| 51 |
-
return get_error_data_result(retmsg="`chunk_count` or `document_count` must not be provided")
|
| 52 |
-
if "name" not in req:
|
| 53 |
return get_error_data_result(
|
| 54 |
-
retmsg="`
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
req["name"] = req["name"].strip()
|
| 57 |
if req["name"] == "":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
return get_error_data_result(
|
| 59 |
-
retmsg="
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
retmsg="Duplicated dataset name in creating dataset.")
|
| 63 |
-
req["tenant_id"] = req['created_by'] = tenant_id
|
| 64 |
if not req.get("embedding_model"):
|
| 65 |
-
req[
|
| 66 |
else:
|
| 67 |
-
valid_embedding_models=[
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
if not embd_model:
|
| 73 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 74 |
if embd_model:
|
| 75 |
-
if req[
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
key_mapping = {
|
| 78 |
"chunk_num": "chunk_count",
|
| 79 |
"doc_num": "document_count",
|
| 80 |
"parser_id": "chunk_method",
|
| 81 |
-
"embd_id": "embedding_model"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
}
|
| 83 |
-
mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
|
| 84 |
req.update(mapped_keys)
|
| 85 |
if not KnowledgebaseService.save(**req):
|
| 86 |
return get_error_data_result(retmsg="Create dataset error.(Database error)")
|
|
@@ -91,21 +195,53 @@ def create(tenant_id):
|
|
| 91 |
renamed_data[new_key] = value
|
| 92 |
return get_result(data=renamed_data)
|
| 93 |
|
| 94 |
-
|
|
|
|
| 95 |
@token_required
|
| 96 |
def delete(tenant_id):
|
|
|
| 97 |
req = request.json
|
| 98 |
if not req:
|
| 99 |
-
ids=None
|
| 100 |
else:
|
| 101 |
-
ids=req.get("ids")
|
| 102 |
if not ids:
|
| 103 |
id_list = []
|
| 104 |
-
kbs=KnowledgebaseService.query(tenant_id=tenant_id)
|
| 105 |
for kb in kbs:
|
| 106 |
id_list.append(kb.id)
|
| 107 |
else:
|
| 108 |
-
id_list=ids
|
| 109 |
for id in id_list:
|
| 110 |
kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
|
| 111 |
if not kbs:
|
|
@@ -113,19 +249,75 @@ def delete(tenant_id):
|
|
| 113 |
for doc in DocumentService.query(kb_id=id):
|
| 114 |
if not DocumentService.remove_document(doc, tenant_id):
|
| 115 |
return get_error_data_result(
|
| 116 |
-
retmsg="Remove document error.(Database error)"
|
|
|
|
| 117 |
f2d = File2DocumentService.get_by_document_id(doc.id)
|
| 118 |
-
FileService.filter_delete(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
File2DocumentService.delete_by_document_id(doc.id)
|
| 120 |
if not KnowledgebaseService.delete_by_id(id):
|
| 121 |
-
return get_error_data_result(
|
| 122 |
-
retmsg="Delete dataset error.(Database error)")
|
| 123 |
return get_result(retcode=RetCode.SUCCESS)
|
| 124 |
|
| 125 |
-
|
|
|
|
| 126 |
@token_required
|
| 127 |
-
def update(tenant_id,dataset_id):
|
| 128 |
-
|
|
| 129 |
return get_error_data_result(retmsg="You don't own the dataset")
|
| 130 |
req = request.json
|
| 131 |
e, t = TenantService.get_by_id(tenant_id)
|
|
@@ -138,91 +330,202 @@ def update(tenant_id,dataset_id):
|
|
| 138 |
parser_config = req.get("parser_config")
|
| 139 |
valid_permission = ["me", "team"]
|
| 140 |
valid_language = ["Chinese", "English"]
|
| 141 |
-
valid_chunk_method = [
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
if check_validation:
|
| 145 |
return check_validation
|
| 146 |
if "tenant_id" in req:
|
| 147 |
if req["tenant_id"] != tenant_id:
|
| 148 |
-
return get_error_data_result(
|
| 149 |
-
retmsg="Can't change `tenant_id`.")
|
| 150 |
e, kb = KnowledgebaseService.get_by_id(dataset_id)
|
| 151 |
if "parser_config" in req:
|
| 152 |
-
temp_dict=kb.parser_config
|
| 153 |
temp_dict.update(req["parser_config"])
|
| 154 |
req["parser_config"] = temp_dict
|
| 155 |
if "chunk_count" in req:
|
| 156 |
if req["chunk_count"] != kb.chunk_num:
|
| 157 |
-
return get_error_data_result(
|
| 158 |
-
retmsg="Can't change `chunk_count`.")
|
| 159 |
req.pop("chunk_count")
|
| 160 |
if "document_count" in req:
|
| 161 |
-
if req[
|
| 162 |
-
return get_error_data_result(
|
| 163 |
-
retmsg="Can't change `document_count`.")
|
| 164 |
req.pop("document_count")
|
| 165 |
if "chunk_method" in req:
|
| 166 |
-
if kb.chunk_num != 0 and req[
|
| 167 |
return get_error_data_result(
|
| 168 |
-
retmsg="If `chunk_count` is not 0, `chunk_method` is not changeable."
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
if not req.get("parser_config"):
|
| 172 |
req["parser_config"] = get_parser_config(chunk_method, parser_config)
|
| 173 |
if "embedding_model" in req:
|
| 174 |
-
if kb.chunk_num != 0 and req[
|
| 175 |
return get_error_data_result(
|
| 176 |
-
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable."
|
|
|
|
| 177 |
if not req.get("embedding_model"):
|
| 178 |
return get_error_data_result("`embedding_model` can't be empty")
|
| 179 |
-
valid_embedding_models=[
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
if not embd_model:
|
| 185 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 186 |
if embd_model:
|
| 187 |
-
if req[
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
if "name" in req:
|
| 191 |
req["name"] = req["name"].strip()
|
| 192 |
-
if
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
return get_error_data_result(
|
| 196 |
-
retmsg="Duplicated dataset name in updating dataset."
|
|
|
|
| 197 |
if not KnowledgebaseService.update_by_id(kb.id, req):
|
| 198 |
return get_error_data_result(retmsg="Update dataset error.(Database error)")
|
| 199 |
return get_result(retcode=RetCode.SUCCESS)
|
| 200 |
|
| 201 |
-
|
|
|
|
| 202 |
@token_required
|
| 203 |
def list(tenant_id):
|
|
|
| 204 |
id = request.args.get("id")
|
| 205 |
name = request.args.get("name")
|
| 206 |
-
kbs = KnowledgebaseService.query(id=id,name=name,status=1)
|
| 207 |
if not kbs:
|
| 208 |
return get_error_data_result(retmsg="The dataset doesn't exist")
|
| 209 |
page_number = int(request.args.get("page", 1))
|
| 210 |
items_per_page = int(request.args.get("page_size", 1024))
|
| 211 |
orderby = request.args.get("orderby", "create_time")
|
| 212 |
-
if request.args.get("desc") == "False" or request.args.get("desc") == "false"
|
| 213 |
desc = False
|
| 214 |
else:
|
| 215 |
desc = True
|
| 216 |
tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
|
| 217 |
kbs = KnowledgebaseService.get_list(
|
| 218 |
-
[m["tenant_id"] for m in tenants],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
renamed_list = []
|
| 220 |
for kb in kbs:
|
| 221 |
key_mapping = {
|
| 222 |
"chunk_num": "chunk_count",
|
| 223 |
"doc_num": "document_count",
|
| 224 |
"parser_id": "chunk_method",
|
| 225 |
-
"embd_id": "embedding_model"
|
| 226 |
}
|
| 227 |
renamed_data = {}
|
| 228 |
for key, value in kb.items():
|
|
|
|
| 21 |
from api.db.services.file2document_service import File2DocumentService
|
| 22 |
from api.db.services.file_service import FileService
|
| 23 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
| 24 |
+
from api.db.services.llm_service import TenantLLMService, LLMService
|
| 25 |
from api.db.services.user_service import TenantService
|
| 26 |
from api.settings import RetCode
|
| 27 |
from api.utils import get_uuid
|
| 28 |
+
from api.utils.api_utils import (
|
| 29 |
+
get_result,
|
| 30 |
+
token_required,
|
| 31 |
+
get_error_data_result,
|
| 32 |
+
valid,
|
| 33 |
+
get_parser_config,
|
| 34 |
+
)
|
| 35 |
|
| 36 |
|
| 37 |
+
@manager.route("/datasets", methods=["POST"])
|
| 38 |
@token_required
|
| 39 |
def create(tenant_id):
|
| 40 |
+
"""
|
| 41 |
+
Create a new dataset.
|
| 42 |
+
---
|
| 43 |
+
tags:
|
| 44 |
+
- Datasets
|
| 45 |
+
security:
|
| 46 |
+
- ApiKeyAuth: []
|
| 47 |
+
parameters:
|
| 48 |
+
- in: header
|
| 49 |
+
name: Authorization
|
| 50 |
+
type: string
|
| 51 |
+
required: true
|
| 52 |
+
description: Bearer token for authentication.
|
| 53 |
+
- in: body
|
| 54 |
+
name: body
|
| 55 |
+
description: Dataset creation parameters.
|
| 56 |
+
required: true
|
| 57 |
+
schema:
|
| 58 |
+
type: object
|
| 59 |
+
required:
|
| 60 |
+
- name
|
| 61 |
+
properties:
|
| 62 |
+
name:
|
| 63 |
+
type: string
|
| 64 |
+
description: Name of the dataset.
|
| 65 |
+
permission:
|
| 66 |
+
type: string
|
| 67 |
+
enum: ['me', 'team']
|
| 68 |
+
description: Dataset permission.
|
| 69 |
+
language:
|
| 70 |
+
type: string
|
| 71 |
+
enum: ['Chinese', 'English']
|
| 72 |
+
description: Language of the dataset.
|
| 73 |
+
chunk_method:
|
| 74 |
+
type: string
|
| 75 |
+
enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
|
| 76 |
+
"presentation", "picture", "one", "knowledge_graph", "email"]
|
| 77 |
+
description: Chunking method.
|
| 78 |
+
parser_config:
|
| 79 |
+
type: object
|
| 80 |
+
description: Parser configuration.
|
| 81 |
+
responses:
|
| 82 |
+
200:
|
| 83 |
+
description: Successful operation.
|
| 84 |
+
schema:
|
| 85 |
+
type: object
|
| 86 |
+
properties:
|
| 87 |
+
data:
|
| 88 |
+
type: object
|
| 89 |
+
"""
|
| 90 |
req = request.json
|
| 91 |
e, t = TenantService.get_by_id(tenant_id)
|
| 92 |
permission = req.get("permission")
|
|
|
|
| 94 |
chunk_method = req.get("chunk_method")
|
| 95 |
parser_config = req.get("parser_config")
|
| 96 |
valid_permission = ["me", "team"]
|
| 97 |
+
valid_language = ["Chinese", "English"]
|
| 98 |
+
valid_chunk_method = [
|
| 99 |
+
"naive",
|
| 100 |
+
"manual",
|
| 101 |
+
"qa",
|
| 102 |
+
"table",
|
| 103 |
+
"paper",
|
| 104 |
+
"book",
|
| 105 |
+
"laws",
|
| 106 |
+
"presentation",
|
| 107 |
+
"picture",
|
| 108 |
+
"one",
|
| 109 |
+
"knowledge_graph",
|
| 110 |
+
"email",
|
| 111 |
+
]
|
| 112 |
+
check_validation = valid(
|
| 113 |
+
permission,
|
| 114 |
+
valid_permission,
|
| 115 |
+
language,
|
| 116 |
+
valid_language,
|
| 117 |
+
chunk_method,
|
| 118 |
+
valid_chunk_method,
|
| 119 |
+
)
|
| 120 |
if check_validation:
|
| 121 |
return check_validation
|
| 122 |
+
req["parser_config"] = get_parser_config(chunk_method, parser_config)
|
| 123 |
if "tenant_id" in req:
|
| 124 |
+
return get_error_data_result(retmsg="`tenant_id` must not be provided")
|
|
|
|
| 125 |
if "chunk_count" in req or "document_count" in req:
|
|
|
|
|
|
|
| 126 |
return get_error_data_result(
|
| 127 |
+
retmsg="`chunk_count` or `document_count` must not be provided"
|
| 128 |
+
)
|
| 129 |
+
if "name" not in req:
|
| 130 |
+
return get_error_data_result(retmsg="`name` is not empty!")
|
| 131 |
+
req["id"] = get_uuid()
|
| 132 |
req["name"] = req["name"].strip()
|
| 133 |
if req["name"] == "":
|
| 134 |
+
return get_error_data_result(retmsg="`name` is not empty string!")
|
| 135 |
+
if KnowledgebaseService.query(
|
| 136 |
+
name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value
|
| 137 |
+
):
|
| 138 |
return get_error_data_result(
|
| 139 |
+
retmsg="Duplicated dataset name in creating dataset."
|
| 140 |
+
)
|
| 141 |
+
req["tenant_id"] = req["created_by"] = tenant_id
|
|
|
|
|
|
|
| 142 |
if not req.get("embedding_model"):
|
| 143 |
+
req["embedding_model"] = t.embd_id
|
| 144 |
else:
|
| 145 |
+
valid_embedding_models = [
|
| 146 |
+
"BAAI/bge-large-zh-v1.5",
|
| 147 |
+
"BAAI/bge-base-en-v1.5",
|
| 148 |
+
"BAAI/bge-large-en-v1.5",
|
| 149 |
+
"BAAI/bge-small-en-v1.5",
|
| 150 |
+
"BAAI/bge-small-zh-v1.5",
|
| 151 |
+
"jinaai/jina-embeddings-v2-base-en",
|
| 152 |
+
"jinaai/jina-embeddings-v2-small-en",
|
| 153 |
+
"nomic-ai/nomic-embed-text-v1.5",
|
| 154 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
| 155 |
+
"text-embedding-v2",
|
| 156 |
+
"text-embedding-v3",
|
| 157 |
+
"maidalun1020/bce-embedding-base_v1",
|
| 158 |
+
]
|
| 159 |
+
embd_model = LLMService.query(
|
| 160 |
+
llm_name=req["embedding_model"], model_type="embedding"
|
| 161 |
+
)
|
| 162 |
if not embd_model:
|
| 163 |
+
return get_error_data_result(
|
| 164 |
+
f"`embedding_model` {req.get('embedding_model')} doesn't exist"
|
| 165 |
+
)
|
| 166 |
if embd_model:
|
| 167 |
+
if req[
|
| 168 |
+
"embedding_model"
|
| 169 |
+
] not in valid_embedding_models and not TenantLLMService.query(
|
| 170 |
+
tenant_id=tenant_id,
|
| 171 |
+
model_type="embedding",
|
| 172 |
+
llm_name=req.get("embedding_model"),
|
| 173 |
+
):
|
| 174 |
+
return get_error_data_result(
|
| 175 |
+
f"`embedding_model` {req.get('embedding_model')} doesn't exist"
|
| 176 |
+
)
|
| 177 |
key_mapping = {
|
| 178 |
"chunk_num": "chunk_count",
|
| 179 |
"doc_num": "document_count",
|
| 180 |
"parser_id": "chunk_method",
|
| 181 |
+
"embd_id": "embedding_model",
|
| 182 |
+
}
|
| 183 |
+
mapped_keys = {
|
| 184 |
+
new_key: req[old_key]
|
| 185 |
+
for new_key, old_key in key_mapping.items()
|
| 186 |
+
if old_key in req
|
| 187 |
}
|
|
|
|
| 188 |
req.update(mapped_keys)
|
| 189 |
if not KnowledgebaseService.save(**req):
|
| 190 |
return get_error_data_result(retmsg="Create dataset error.(Database error)")
|
|
|
|
| 195 |
renamed_data[new_key] = value
|
| 196 |
return get_result(data=renamed_data)
|
| 197 |
|
| 198 |
+
|
| 199 |
+
@manager.route("/datasets", methods=["DELETE"])
|
| 200 |
@token_required
|
| 201 |
def delete(tenant_id):
|
| 202 |
+
"""
|
| 203 |
+
Delete datasets.
|
| 204 |
+
---
|
| 205 |
+
tags:
|
| 206 |
+
- Datasets
|
| 207 |
+
security:
|
| 208 |
+
- ApiKeyAuth: []
|
| 209 |
+
parameters:
|
| 210 |
+
- in: header
|
| 211 |
+
name: Authorization
|
| 212 |
+
type: string
|
| 213 |
+
required: true
|
| 214 |
+
description: Bearer token for authentication.
|
| 215 |
+
- in: body
|
| 216 |
+
name: body
|
| 217 |
+
description: Dataset deletion parameters.
|
| 218 |
+
required: true
|
| 219 |
+
schema:
|
| 220 |
+
type: object
|
| 221 |
+
properties:
|
| 222 |
+
ids:
|
| 223 |
+
type: array
|
| 224 |
+
items:
|
| 225 |
+
type: string
|
| 226 |
+
description: List of dataset IDs to delete.
|
| 227 |
+
responses:
|
| 228 |
+
200:
|
| 229 |
+
description: Successful operation.
|
| 230 |
+
schema:
|
| 231 |
+
type: object
|
| 232 |
+
"""
|
| 233 |
req = request.json
|
| 234 |
if not req:
|
| 235 |
+
ids = None
|
| 236 |
else:
|
| 237 |
+
ids = req.get("ids")
|
| 238 |
if not ids:
|
| 239 |
id_list = []
|
| 240 |
+
kbs = KnowledgebaseService.query(tenant_id=tenant_id)
|
| 241 |
for kb in kbs:
|
| 242 |
id_list.append(kb.id)
|
| 243 |
else:
|
| 244 |
+
id_list = ids
|
| 245 |
for id in id_list:
|
| 246 |
kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
|
| 247 |
if not kbs:
|
|
|
|
| 249 |
for doc in DocumentService.query(kb_id=id):
|
| 250 |
if not DocumentService.remove_document(doc, tenant_id):
|
| 251 |
return get_error_data_result(
|
| 252 |
+
retmsg="Remove document error.(Database error)"
|
| 253 |
+
)
|
| 254 |
f2d = File2DocumentService.get_by_document_id(doc.id)
|
| 255 |
+
FileService.filter_delete(
|
| 256 |
+
[
|
| 257 |
+
File.source_type == FileSource.KNOWLEDGEBASE,
|
| 258 |
+
File.id == f2d[0].file_id,
|
| 259 |
+
]
|
| 260 |
+
)
|
| 261 |
File2DocumentService.delete_by_document_id(doc.id)
|
| 262 |
if not KnowledgebaseService.delete_by_id(id):
|
| 263 |
+
return get_error_data_result(retmsg="Delete dataset error.(Database error)")
|
|
|
|
| 264 |
return get_result(retcode=RetCode.SUCCESS)
|
| 265 |
|
| 266 |
+
|
| 267 |
+
@manager.route("/datasets/<dataset_id>", methods=["PUT"])
|
| 268 |
@token_required
|
| 269 |
+
def update(tenant_id, dataset_id):
|
| 270 |
+
"""
|
| 271 |
+
Update a dataset.
|
| 272 |
+
---
|
| 273 |
+
tags:
|
| 274 |
+
- Datasets
|
| 275 |
+
security:
|
| 276 |
+
- ApiKeyAuth: []
|
| 277 |
+
parameters:
|
| 278 |
+
- in: path
|
| 279 |
+
name: dataset_id
|
| 280 |
+
type: string
|
| 281 |
+
required: true
|
| 282 |
+
description: ID of the dataset to update.
|
| 283 |
+
- in: header
|
| 284 |
+
name: Authorization
|
| 285 |
+
type: string
|
| 286 |
+
required: true
|
| 287 |
+
description: Bearer token for authentication.
|
| 288 |
+
- in: body
|
| 289 |
+
name: body
|
| 290 |
+
description: Dataset update parameters.
|
| 291 |
+
required: true
|
| 292 |
+
schema:
|
| 293 |
+
type: object
|
| 294 |
+
properties:
|
| 295 |
+
name:
|
| 296 |
+
type: string
|
| 297 |
+
description: New name of the dataset.
|
| 298 |
+
permission:
|
| 299 |
+
type: string
|
| 300 |
+
enum: ['me', 'team']
|
| 301 |
+
description: Updated permission.
|
| 302 |
+
language:
|
| 303 |
+
type: string
|
| 304 |
+
enum: ['Chinese', 'English']
|
| 305 |
+
description: Updated language.
|
| 306 |
+
chunk_method:
|
| 307 |
+
type: string
|
| 308 |
+
enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
|
| 309 |
+
"presentation", "picture", "one", "knowledge_graph", "email"]
|
| 310 |
+
description: Updated chunking method.
|
| 311 |
+
parser_config:
|
| 312 |
+
type: object
|
| 313 |
+
description: Updated parser configuration.
|
| 314 |
+
responses:
|
| 315 |
+
200:
|
| 316 |
+
description: Successful operation.
|
| 317 |
+
schema:
|
| 318 |
+
type: object
|
| 319 |
+
"""
|
| 320 |
+
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 321 |
return get_error_data_result(retmsg="You don't own the dataset")
|
| 322 |
req = request.json
|
| 323 |
e, t = TenantService.get_by_id(tenant_id)
|
|
|
|
| 330 |
parser_config = req.get("parser_config")
|
| 331 |
valid_permission = ["me", "team"]
|
| 332 |
valid_language = ["Chinese", "English"]
|
| 333 |
+
valid_chunk_method = [
|
| 334 |
+
"naive",
|
| 335 |
+
"manual",
|
| 336 |
+
"qa",
|
| 337 |
+
"table",
|
| 338 |
+
"paper",
|
| 339 |
+
"book",
|
| 340 |
+
"laws",
|
| 341 |
+
"presentation",
|
| 342 |
+
"picture",
|
| 343 |
+
"one",
|
| 344 |
+
"knowledge_graph",
|
| 345 |
+
"email",
|
| 346 |
+
]
|
| 347 |
+
check_validation = valid(
|
| 348 |
+
permission,
|
| 349 |
+
valid_permission,
|
| 350 |
+
language,
|
| 351 |
+
valid_language,
|
| 352 |
+
chunk_method,
|
| 353 |
+
valid_chunk_method,
|
| 354 |
+
)
|
| 355 |
if check_validation:
|
| 356 |
return check_validation
|
| 357 |
if "tenant_id" in req:
|
| 358 |
if req["tenant_id"] != tenant_id:
|
| 359 |
+
return get_error_data_result(retmsg="Can't change `tenant_id`.")
|
|
|
|
| 360 |
e, kb = KnowledgebaseService.get_by_id(dataset_id)
|
| 361 |
if "parser_config" in req:
|
| 362 |
+
temp_dict = kb.parser_config
|
| 363 |
temp_dict.update(req["parser_config"])
|
| 364 |
req["parser_config"] = temp_dict
|
| 365 |
if "chunk_count" in req:
|
| 366 |
if req["chunk_count"] != kb.chunk_num:
|
| 367 |
+
return get_error_data_result(retmsg="Can't change `chunk_count`.")
|
|
|
|
| 368 |
req.pop("chunk_count")
|
| 369 |
if "document_count" in req:
|
| 370 |
+
if req["document_count"] != kb.doc_num:
|
| 371 |
+
return get_error_data_result(retmsg="Can't change `document_count`.")
|
|
|
|
| 372 |
req.pop("document_count")
|
| 373 |
if "chunk_method" in req:
|
| 374 |
+
if kb.chunk_num != 0 and req["chunk_method"] != kb.parser_id:
|
| 375 |
return get_error_data_result(
|
| 376 |
+
retmsg="If `chunk_count` is not 0, `chunk_method` is not changeable."
|
| 377 |
+
)
|
| 378 |
+
req["parser_id"] = req.pop("chunk_method")
|
| 379 |
+
if req["parser_id"] != kb.parser_id:
|
| 380 |
if not req.get("parser_config"):
|
| 381 |
req["parser_config"] = get_parser_config(chunk_method, parser_config)
|
| 382 |
if "embedding_model" in req:
|
| 383 |
+
if kb.chunk_num != 0 and req["embedding_model"] != kb.embd_id:
|
| 384 |
return get_error_data_result(
|
| 385 |
+
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable."
|
| 386 |
+
)
|
| 387 |
if not req.get("embedding_model"):
|
| 388 |
return get_error_data_result("`embedding_model` can't be empty")
|
| 389 |
+
valid_embedding_models = [
|
| 390 |
+
"BAAI/bge-large-zh-v1.5",
|
| 391 |
+
"BAAI/bge-base-en-v1.5",
|
| 392 |
+
"BAAI/bge-large-en-v1.5",
|
| 393 |
+
"BAAI/bge-small-en-v1.5",
|
| 394 |
+
"BAAI/bge-small-zh-v1.5",
|
| 395 |
+
"jinaai/jina-embeddings-v2-base-en",
|
| 396 |
+
"jinaai/jina-embeddings-v2-small-en",
|
| 397 |
+
"nomic-ai/nomic-embed-text-v1.5",
|
| 398 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
| 399 |
+
"text-embedding-v2",
|
| 400 |
+
"text-embedding-v3",
|
| 401 |
+
"maidalun1020/bce-embedding-base_v1",
|
| 402 |
+
]
|
| 403 |
+
embd_model = LLMService.query(
|
| 404 |
+
llm_name=req["embedding_model"], model_type="embedding"
|
| 405 |
+
)
|
| 406 |
if not embd_model:
|
| 407 |
+
return get_error_data_result(
|
| 408 |
+
f"`embedding_model` {req.get('embedding_model')} doesn't exist"
|
| 409 |
+
)
|
| 410 |
if embd_model:
|
| 411 |
+
if req[
|
| 412 |
+
"embedding_model"
|
| 413 |
+
] not in valid_embedding_models and not TenantLLMService.query(
|
| 414 |
+
tenant_id=tenant_id,
|
| 415 |
+
model_type="embedding",
|
| 416 |
+
llm_name=req.get("embedding_model"),
|
| 417 |
+
):
|
| 418 |
+
return get_error_data_result(
|
| 419 |
+
f"`embedding_model` {req.get('embedding_model')} doesn't exist"
|
| 420 |
+
)
|
| 421 |
+
req["embd_id"] = req.pop("embedding_model")
|
| 422 |
if "name" in req:
|
| 423 |
req["name"] = req["name"].strip()
|
| 424 |
+
if (
|
| 425 |
+
req["name"].lower() != kb.name.lower()
|
| 426 |
+
and len(
|
| 427 |
+
KnowledgebaseService.query(
|
| 428 |
+
name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value
|
| 429 |
+
)
|
| 430 |
+
)
|
| 431 |
+
> 0
|
| 432 |
+
):
|
| 433 |
return get_error_data_result(
|
| 434 |
+
retmsg="Duplicated dataset name in updating dataset."
|
| 435 |
+
)
|
| 436 |
if not KnowledgebaseService.update_by_id(kb.id, req):
|
| 437 |
return get_error_data_result(retmsg="Update dataset error.(Database error)")
|
| 438 |
return get_result(retcode=RetCode.SUCCESS)
|
| 439 |
|
| 440 |
+
|
| 441 |
+
@manager.route("/datasets", methods=["GET"])
|
| 442 |
@token_required
|
| 443 |
def list(tenant_id):
|
| 444 |
+
"""
|
| 445 |
+
List datasets.
|
| 446 |
+
---
|
| 447 |
+
tags:
|
| 448 |
+
- Datasets
|
| 449 |
+
security:
|
| 450 |
+
- ApiKeyAuth: []
|
| 451 |
+
parameters:
|
| 452 |
+
- in: query
|
| 453 |
+
name: id
|
| 454 |
+
type: string
|
| 455 |
+
required: false
|
| 456 |
+
description: Dataset ID to filter.
|
| 457 |
+
- in: query
|
| 458 |
+
name: name
|
| 459 |
+
type: string
|
| 460 |
+
required: false
|
| 461 |
+
description: Dataset name to filter.
|
| 462 |
+
- in: query
|
| 463 |
+
name: page
|
| 464 |
+
type: integer
|
| 465 |
+
required: false
|
| 466 |
+
default: 1
|
| 467 |
+
description: Page number.
|
| 468 |
+
- in: query
|
| 469 |
+
name: page_size
|
| 470 |
+
type: integer
|
| 471 |
+
required: false
|
| 472 |
+
default: 1024
|
| 473 |
+
description: Number of items per page.
|
| 474 |
+
- in: query
|
| 475 |
+
name: orderby
|
| 476 |
+
type: string
|
| 477 |
+
required: false
|
| 478 |
+
default: "create_time"
|
| 479 |
+
description: Field to order by.
|
| 480 |
+
- in: query
|
| 481 |
+
name: desc
|
| 482 |
+
type: boolean
|
| 483 |
+
required: false
|
| 484 |
+
default: true
|
| 485 |
+
description: Order in descending.
|
| 486 |
+
- in: header
|
| 487 |
+
name: Authorization
|
| 488 |
+
type: string
|
| 489 |
+
required: true
|
| 490 |
+
description: Bearer token for authentication.
|
| 491 |
+
responses:
|
| 492 |
+
200:
|
| 493 |
+
description: Successful operation.
|
| 494 |
+
schema:
|
| 495 |
+
type: array
|
| 496 |
+
items:
|
| 497 |
+
type: object
|
| 498 |
+
"""
|
| 499 |
id = request.args.get("id")
|
| 500 |
name = request.args.get("name")
|
| 501 |
+
kbs = KnowledgebaseService.query(id=id, name=name, status=1)
|
| 502 |
if not kbs:
|
| 503 |
return get_error_data_result(retmsg="The dataset doesn't exist")
|
| 504 |
page_number = int(request.args.get("page", 1))
|
| 505 |
items_per_page = int(request.args.get("page_size", 1024))
|
| 506 |
orderby = request.args.get("orderby", "create_time")
|
| 507 |
+
if request.args.get("desc") == "False" or request.args.get("desc") == "false":
|
| 508 |
desc = False
|
| 509 |
else:
|
| 510 |
desc = True
|
| 511 |
tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
|
| 512 |
kbs = KnowledgebaseService.get_list(
|
| 513 |
+
[m["tenant_id"] for m in tenants],
|
| 514 |
+
tenant_id,
|
| 515 |
+
page_number,
|
| 516 |
+
items_per_page,
|
| 517 |
+
orderby,
|
| 518 |
+
desc,
|
| 519 |
+
id,
|
| 520 |
+
name,
|
| 521 |
+
)
|
| 522 |
renamed_list = []
|
| 523 |
for kb in kbs:
|
| 524 |
key_mapping = {
|
| 525 |
"chunk_num": "chunk_count",
|
| 526 |
"doc_num": "document_count",
|
| 527 |
"parser_id": "chunk_method",
|
| 528 |
+
"embd_id": "embedding_model",
|
| 529 |
}
|
| 530 |
renamed_data = {}
|
| 531 |
for key, value in kb.items():
|
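The docstrings added above translate directly into calls like the following sketch. It assumes `API_VERSION` resolves to `v1`, a local deployment on the default port, and a valid API key; all values are placeholders.

```python
# Sketch: exercise the documented dataset endpoints. Assumes API_VERSION == "v1",
# a local host/port, and a valid API key; every value here is a placeholder.
import requests

BASE = "http://localhost:9380/api/v1"
HEADERS = {"Authorization": "Bearer <YOUR_API_KEY>"}

# POST /datasets — create a dataset (see the `create` docstring above)
created = requests.post(
    f"{BASE}/datasets",
    headers=HEADERS,
    json={"name": "demo_dataset", "chunk_method": "naive"},
).json()

# GET /datasets — list datasets (see the `list` docstring above)
listed = requests.get(f"{BASE}/datasets", headers=HEADERS, params={"page": 1}).json()
print(created, listed)
```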
api/apps/sdk/doc.py
CHANGED
|
@@ -39,7 +39,7 @@ from api.db.services.file2document_service import File2DocumentService
|
|
| 39 |
from api.db.services.file_service import FileService
|
| 40 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
| 41 |
from api.settings import RetCode, retrievaler
|
| 42 |
-
from api.utils.api_utils import construct_json_result,get_parser_config
|
| 43 |
from rag.nlp import search
|
| 44 |
from rag.utils import rmSpace
|
| 45 |
from rag.utils.es_conn import ELASTICSEARCH
|
|
@@ -49,36 +49,93 @@ import os
|
|
| 49 |
MAXIMUM_OF_UPLOADING_FILES = 256
|
| 50 |
|
| 51 |
|
| 52 |
-
|
| 53 |
-
@manager.route('/datasets/<dataset_id>/documents', methods=['POST'])
|
| 54 |
@token_required
|
| 55 |
def upload(dataset_id, tenant_id):
|
| 56 |
-
|
|
| 57 |
return get_error_data_result(
|
| 58 |
-
retmsg=
|
| 59 |
-
|
|
|
|
| 60 |
for file_obj in file_objs:
|
| 61 |
-
if file_obj.filename ==
|
| 62 |
return get_result(
|
| 63 |
-
retmsg=
|
|
|
|
| 64 |
# total size
|
| 65 |
total_size = 0
|
| 66 |
for file_obj in file_objs:
|
| 67 |
file_obj.seek(0, os.SEEK_END)
|
| 68 |
total_size += file_obj.tell()
|
| 69 |
file_obj.seek(0)
|
| 70 |
-
MAX_TOTAL_FILE_SIZE=10*1024*1024
|
| 71 |
if total_size > MAX_TOTAL_FILE_SIZE:
|
| 72 |
return get_result(
|
| 73 |
-
retmsg=f
|
| 74 |
-
retcode=RetCode.ARGUMENT_ERROR
|
|
|
|
| 75 |
e, kb = KnowledgebaseService.get_by_id(dataset_id)
|
| 76 |
if not e:
|
| 77 |
raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
|
| 78 |
-
err, files= FileService.upload_document(kb, file_objs, tenant_id)
|
| 79 |
if err:
|
| 80 |
-
return get_result(
|
| 81 |
-
retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
|
| 82 |
# rename key's name
|
| 83 |
renamed_doc_list = []
|
| 84 |
for file in files:
|
|
@@ -87,7 +144,7 @@ def upload(dataset_id, tenant_id):
|
|
| 87 |
"chunk_num": "chunk_count",
|
| 88 |
"kb_id": "dataset_id",
|
| 89 |
"token_num": "token_count",
|
| 90 |
-
"parser_id": "chunk_method"
|
| 91 |
}
|
| 92 |
renamed_doc = {}
|
| 93 |
for key, value in doc.items():
|
|
@@ -98,9 +155,54 @@ def upload(dataset_id, tenant_id):
|
|
| 98 |
return get_result(data=renamed_doc_list)
|
| 99 |
|
| 100 |
|
| 101 |
-
@manager.route(
|
| 102 |
@token_required
|
| 103 |
def update_doc(tenant_id, dataset_id, document_id):
|
|
|
| 104 |
req = request.json
|
| 105 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 106 |
return get_error_data_result(retmsg="You don't own the dataset.")
|
|
@@ -115,20 +217,25 @@ def update_doc(tenant_id, dataset_id, document_id):
|
|
| 115 |
if req["token_count"] != doc.token_num:
|
| 116 |
return get_error_data_result(retmsg="Can't change `token_count`.")
|
| 117 |
if "progress" in req:
|
| 118 |
-
if req[
|
| 119 |
return get_error_data_result(retmsg="Can't change `progress`.")
|
| 120 |
|
| 121 |
if "name" in req and req["name"] != doc.name:
|
| 122 |
-
if
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
|
| 125 |
if d.name == req["name"]:
|
| 126 |
return get_error_data_result(
|
| 127 |
-
retmsg="Duplicated document name in the same dataset."
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
return get_error_data_result(
|
| 131 |
-
retmsg="Database error (Document rename)!")
|
| 132 |
|
| 133 |
informs = File2DocumentService.get_by_document_id(document_id)
|
| 134 |
if informs:
|
|
@@ -137,77 +244,231 @@ def update_doc(tenant_id, dataset_id, document_id):
|
|
| 137 |
if "parser_config" in req:
|
| 138 |
DocumentService.update_parser_config(doc.id, req["parser_config"])
|
| 139 |
if "chunk_method" in req:
|
| 140 |
-
valid_chunk_method = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
if req.get("chunk_method") not in valid_chunk_method:
|
| 142 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 143 |
if doc.parser_id.lower() == req["chunk_method"].lower():
|
| 144 |
-
|
| 145 |
|
| 146 |
-
if doc.type == FileType.VISUAL or re.search(
|
| 147 |
-
r"\.(ppt|pptx|pages)$", doc.name):
|
| 148 |
return get_error_data_result(retmsg="Not supported yet!")
|
| 149 |
|
| 150 |
-
e = DocumentService.update_by_id(
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
if not e:
|
| 154 |
return get_error_data_result(retmsg="Document not found!")
|
| 155 |
-
req["parser_config"] = get_parser_config(
|
|
|
|
|
|
|
| 156 |
DocumentService.update_parser_config(doc.id, req["parser_config"])
|
| 157 |
if doc.token_num > 0:
|
| 158 |
-
e = DocumentService.increment_chunk_num(
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
if not e:
|
| 161 |
return get_error_data_result(retmsg="Document not found!")
|
| 162 |
ELASTICSEARCH.deleteByQuery(
|
| 163 |
-
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
|
|
|
|
| 164 |
|
| 165 |
return get_result()
|
| 166 |
|
| 167 |
|
| 168 |
-
@manager.route(
|
| 169 |
@token_required
|
| 170 |
def download(tenant_id, dataset_id, document_id):
|
|
|
| 171 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 172 |
-
return get_error_data_result(retmsg=f
|
| 173 |
doc = DocumentService.query(kb_id=dataset_id, id=document_id)
|
| 174 |
if not doc:
|
| 175 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 176 |
# The process of downloading
|
| 177 |
-
doc_id, doc_location = File2DocumentService.get_storage_address(
|
|
|
|
|
|
|
| 178 |
file_stream = STORAGE_IMPL.get(doc_id, doc_location)
|
| 179 |
if not file_stream:
|
| 180 |
-
return construct_json_result(
|
|
|
|
|
|
|
| 181 |
file = BytesIO(file_stream)
|
| 182 |
# Use send_file with a proper filename and MIME type
|
| 183 |
return send_file(
|
| 184 |
file,
|
| 185 |
as_attachment=True,
|
| 186 |
download_name=doc[0].name,
|
| 187 |
-
mimetype=
|
| 188 |
)
|
| 189 |
|
| 190 |
|
| 191 |
-
@manager.route(
|
| 192 |
@token_required
|
| 193 |
def list_docs(dataset_id, tenant_id):
|
|
|
| 194 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 195 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
|
| 196 |
id = request.args.get("id")
|
| 197 |
name = request.args.get("name")
|
| 198 |
-
if not DocumentService.query(id=id,kb_id=dataset_id):
|
| 199 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
| 200 |
-
if not DocumentService.query(name=name,kb_id=dataset_id):
|
| 201 |
return get_error_data_result(retmsg=f"You don't own the document {name}.")
|
| 202 |
offset = int(request.args.get("offset", 1))
|
| 203 |
-
keywords = request.args.get("keywords","")
|
| 204 |
limit = int(request.args.get("limit", 1024))
|
| 205 |
orderby = request.args.get("orderby", "create_time")
|
| 206 |
if request.args.get("desc") == "False":
|
| 207 |
desc = False
|
| 208 |
else:
|
| 209 |
desc = True
|
| 210 |
-
docs, tol = DocumentService.get_list(
|
|
|
|
|
|
|
| 211 |
|
| 212 |
# rename key's name
|
| 213 |
renamed_doc_list = []
|
|
@@ -216,42 +477,80 @@ def list_docs(dataset_id, tenant_id):
|
|
| 216 |
"chunk_num": "chunk_count",
|
| 217 |
"kb_id": "dataset_id",
|
| 218 |
"token_num": "token_count",
|
| 219 |
-
"parser_id": "chunk_method"
|
| 220 |
}
|
| 221 |
run_mapping = {
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
}
|
| 228 |
renamed_doc = {}
|
| 229 |
for key, value in doc.items():
|
|
|
|
|
|
|
| 230 |
new_key = key_mapping.get(key, key)
|
| 231 |
renamed_doc[new_key] = value
|
| 232 |
-
if key =="run":
|
| 233 |
-
renamed_doc["run"]=run_mapping.get(value)
|
| 234 |
renamed_doc_list.append(renamed_doc)
|
| 235 |
return get_result(data={"total": tol, "docs": renamed_doc_list})
|
| 236 |
|
| 237 |
|
| 238 |
-
@manager.route(
|
| 239 |
@token_required
|
| 240 |
-
def delete(tenant_id,dataset_id):
|
|
|
| 241 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 242 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
|
| 243 |
req = request.json
|
| 244 |
if not req:
|
| 245 |
-
doc_ids=None
|
| 246 |
else:
|
| 247 |
-
doc_ids=req.get("ids")
|
| 248 |
if not doc_ids:
|
| 249 |
doc_list = []
|
| 250 |
-
docs=DocumentService.query(kb_id=dataset_id)
|
| 251 |
for doc in docs:
|
| 252 |
doc_list.append(doc.id)
|
| 253 |
else:
|
| 254 |
-
doc_list=doc_ids
|
| 255 |
root_folder = FileService.get_root_folder(tenant_id)
|
| 256 |
pf_id = root_folder["id"]
|
| 257 |
FileService.init_knowledgebase_docs(pf_id, tenant_id)
|
|
@@ -269,10 +568,16 @@ def delete(tenant_id,dataset_id):
|
|
| 269 |
|
| 270 |
if not DocumentService.remove_document(doc, tenant_id):
|
| 271 |
return get_error_data_result(
|
| 272 |
-
retmsg="Database error (Document removal)!"
|
|
|
|
| 273 |
|
| 274 |
f2d = File2DocumentService.get_by_document_id(doc_id)
|
| 275 |
-
FileService.filter_delete(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
File2DocumentService.delete_by_document_id(doc_id)
|
| 277 |
|
| 278 |
STORAGE_IMPL.rm(b, n)
|
|
@@ -285,25 +590,66 @@ def delete(tenant_id,dataset_id):
|
|
| 285 |
return get_result()
|
| 286 |
|
| 287 |
|
| 288 |
-
@manager.route(
|
| 289 |
@token_required
|
| 290 |
-
def parse(tenant_id,dataset_id):
|
|
|
| 291 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 292 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 293 |
req = request.json
|
| 294 |
if not req.get("document_ids"):
|
| 295 |
return get_error_data_result("`document_ids` is required")
|
| 296 |
for id in req["document_ids"]:
|
| 297 |
-
doc = DocumentService.query(id=id,kb_id=dataset_id)
|
| 298 |
if not doc:
|
| 299 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
info = {"run": "1", "progress": 0}
|
| 301 |
info["progress_msg"] = ""
|
| 302 |
info["chunk_num"] = 0
|
| 303 |
info["token_num"] = 0
|
| 304 |
DocumentService.update_by_id(id, info)
|
| 305 |
ELASTICSEARCH.deleteByQuery(
|
| 306 |
-
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
|
|
|
| 307 |
TaskService.filter_delete([Task.doc_id == id])
|
| 308 |
e, doc = DocumentService.get_by_id(id)
|
| 309 |
doc = doc.to_dict()
|
|
@@ -312,9 +658,46 @@ def parse(tenant_id,dataset_id):
|
|
| 312 |
queue_tasks(doc, bucket, name)
|
| 313 |
return get_result()
|
| 314 |
|
| 315 |
-
|
|
|
|
| 316 |
@token_required
|
| 317 |
-
def stop_parsing(tenant_id,dataset_id):
|
|
| 318 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 319 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 320 |
req = request.json
|
|
@@ -325,46 +708,125 @@ def stop_parsing(tenant_id,dataset_id):
|
|
| 325 |
if not doc:
|
| 326 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
| 327 |
if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
|
| 328 |
-
return get_error_data_result(
|
| 329 |
-
|
|
|
|
|
|
|
| 330 |
DocumentService.update_by_id(id, info)
|
| 331 |
ELASTICSEARCH.deleteByQuery(
|
| 332 |
-
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
|
|
|
| 333 |
return get_result()
|
| 334 |
|
| 335 |
|
| 336 |
-
@manager.route(
|
| 337 |
@token_required
|
| 338 |
-
def list_chunks(tenant_id,dataset_id,document_id):
|
|
| 339 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 340 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 341 |
-
doc=DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 342 |
if not doc:
|
| 343 |
-
return get_error_data_result(
|
| 344 |
-
|
|
|
|
|
|
|
| 345 |
req = request.args
|
| 346 |
doc_id = document_id
|
| 347 |
page = int(req.get("offset", 1))
|
| 348 |
size = int(req.get("limit", 30))
|
| 349 |
question = req.get("keywords", "")
|
| 350 |
query = {
|
| 351 |
-
"doc_ids": [doc_id],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
}
|
| 353 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 354 |
key_mapping = {
|
| 355 |
"chunk_num": "chunk_count",
|
| 356 |
"kb_id": "dataset_id",
|
| 357 |
"token_num": "token_count",
|
| 358 |
-
"parser_id": "chunk_method"
|
| 359 |
}
|
| 360 |
run_mapping = {
|
| 361 |
"0": "UNSTART",
|
| 362 |
"1": "RUNNING",
|
| 363 |
"2": "CANCEL",
|
| 364 |
"3": "DONE",
|
| 365 |
-
"4": "FAIL"
|
| 366 |
}
|
| 367 |
-
doc=doc.to_dict()
|
| 368 |
renamed_doc = {}
|
| 369 |
for key, value in doc.items():
|
| 370 |
new_key = key_mapping.get(key, key)
|
|
@@ -377,21 +839,30 @@ def list_chunks(tenant_id,dataset_id,document_id):
|
|
| 377 |
for id in sres.ids:
|
| 378 |
d = {
|
| 379 |
"chunk_id": id,
|
| 380 |
-
"content_with_weight":
|
| 381 |
-
id]
|
| 382 |
-
|
|
|
|
|
|
|
| 383 |
"doc_id": sres.field[id]["doc_id"],
|
| 384 |
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
| 385 |
"important_kwd": sres.field[id].get("important_kwd", []),
|
| 386 |
"img_id": sres.field[id].get("img_id", ""),
|
| 387 |
"available_int": sres.field[id].get("available_int", 1),
|
| 388 |
-
"positions": sres.field[id].get("position_int", "").split("\t")
|
| 389 |
}
|
| 390 |
if len(d["positions"]) % 5 == 0:
|
| 391 |
poss = []
|
| 392 |
for i in range(0, len(d["positions"]), 5):
|
| 393 |
-
poss.append(
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
d["positions"] = poss
|
| 396 |
|
| 397 |
origin_chunks.append(d)
|
|
@@ -411,7 +882,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
|
|
| 411 |
"doc_id": "document_id",
|
| 412 |
"important_kwd": "important_keywords",
|
| 413 |
"img_id": "image_id",
|
| 414 |
-
"available_int":"available"
|
| 415 |
}
|
| 416 |
renamed_chunk = {}
|
| 417 |
for key, value in chunk.items():
|
|
@@ -425,31 +896,104 @@ def list_chunks(tenant_id,dataset_id,document_id):
|
|
| 425 |
return get_result(data=res)
|
| 426 |
|
| 427 |
|
| 428 |
-
|
| 429 |
-
|
|
|
|
| 430 |
@token_required
|
| 431 |
-
def add_chunk(tenant_id,dataset_id,document_id):
|
|
| 432 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 433 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 434 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 435 |
if not doc:
|
| 436 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 437 |
doc = doc[0]
|
| 438 |
req = request.json
|
| 439 |
if not req.get("content"):
|
| 440 |
return get_error_data_result(retmsg="`content` is required")
|
| 441 |
if "important_keywords" in req:
|
| 442 |
if type(req["important_keywords"]) != list:
|
| 443 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 444 |
md5 = hashlib.md5()
|
| 445 |
md5.update((req["content"] + document_id).encode("utf-8"))
|
| 446 |
|
| 447 |
chunk_id = md5.hexdigest()
|
| 448 |
-
d = {
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
| 450 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 451 |
d["important_kwd"] = req.get("important_keywords", [])
|
| 452 |
-
d["important_tks"] = rag_tokenizer.tokenize(
|
|
|
|
|
|
|
| 453 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
| 454 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
| 455 |
d["kb_id"] = [doc.kb_id]
|
|
@@ -457,17 +1001,17 @@ def add_chunk(tenant_id,dataset_id,document_id):
|
|
| 457 |
d["doc_id"] = doc.id
|
| 458 |
embd_id = DocumentService.get_embd_id(document_id)
|
| 459 |
embd_mdl = TenantLLMService.model_instance(
|
| 460 |
-
tenant_id, LLMType.EMBEDDING.value, embd_id
|
| 461 |
-
|
|
|
|
| 462 |
v, c = embd_mdl.encode([doc.name, req["content"]])
|
| 463 |
v = 0.1 * v[0] + 0.9 * v[1]
|
| 464 |
d["q_%d_vec" % len(v)] = v.tolist()
|
| 465 |
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
| 466 |
|
| 467 |
-
DocumentService.increment_chunk_num(
|
| 468 |
-
doc.id, doc.kb_id, c, 1, 0)
|
| 469 |
d["chunk_id"] = chunk_id
|
| 470 |
-
d["kb_id"]=doc.kb_id
|
| 471 |
# rename keys
|
| 472 |
key_mapping = {
|
| 473 |
"chunk_id": "id",
|
|
@@ -477,7 +1021,7 @@ def add_chunk(tenant_id,dataset_id,document_id):
|
|
| 477 |
"kb_id": "dataset_id",
|
| 478 |
"create_timestamp_flt": "create_timestamp",
|
| 479 |
"create_time": "create_time",
|
| 480 |
-
"document_keyword": "document"
|
| 481 |
}
|
| 482 |
renamed_chunk = {}
|
| 483 |
for key, value in d.items():
|
|
@@ -488,32 +1032,79 @@ def add_chunk(tenant_id,dataset_id,document_id):
|
|
| 488 |
# return get_result(data={"chunk_id": chunk_id})
|
| 489 |
|
| 490 |
|
| 491 |
-
@manager.route(
|
|
|
|
|
|
|
| 492 |
@token_required
|
| 493 |
-
def rm_chunk(tenant_id,dataset_id,document_id):
|
|
| 494 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 495 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 496 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 497 |
if not doc:
|
| 498 |
-
return get_error_data_result(
|
|
|
|
|
|
|
| 499 |
doc = doc[0]
|
| 500 |
req = request.json
|
| 501 |
-
|
| 502 |
-
|
|
|
|
| 503 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 504 |
if not req:
|
| 505 |
-
chunk_ids=None
|
| 506 |
else:
|
| 507 |
-
chunk_ids=req.get("chunk_ids")
|
| 508 |
if not chunk_ids:
|
| 509 |
-
chunk_list=sres.ids
|
| 510 |
else:
|
| 511 |
-
chunk_list=chunk_ids
|
| 512 |
for chunk_id in chunk_list:
|
| 513 |
if chunk_id not in sres.ids:
|
| 514 |
return get_error_data_result(f"Chunk {chunk_id} not found")
|
| 515 |
if not ELASTICSEARCH.deleteByQuery(
|
| 516 |
-
|
|
|
|
| 517 |
return get_error_data_result(retmsg="Index updating failure")
|
| 518 |
deleted_chunk_ids = chunk_list
|
| 519 |
chunk_number = len(deleted_chunk_ids)
|
|
@@ -521,37 +1112,92 @@ def rm_chunk(tenant_id,dataset_id,document_id):
|
|
| 521 |
return get_result()
|
| 522 |
|
| 523 |
|
| 524 |
-
|
| 525 |
-
|
|
|
|
| 526 |
@token_required
|
| 527 |
-
def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
|
|
| 528 |
try:
|
| 529 |
-
res = ELASTICSEARCH.get(
|
| 530 |
-
chunk_id, search.index_name(
|
| 531 |
-
tenant_id))
|
| 532 |
except Exception as e:
|
| 533 |
return get_error_data_result(f"Can't find this chunk {chunk_id}")
|
| 534 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 535 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 536 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 537 |
if not doc:
|
| 538 |
-
return get_error_data_result(
|
|
| 539 |
doc = doc[0]
|
| 540 |
query = {
|
| 541 |
-
"doc_ids": [document_id],
|
| 542 |
}
|
| 543 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 544 |
if chunk_id not in sres.ids:
|
| 545 |
return get_error_data_result(f"You don't own the chunk {chunk_id}")
|
| 546 |
req = request.json
|
| 547 |
-
content=res["_source"].get("content_with_weight")
|
| 548 |
-
d = {
|
| 549 |
-
"id": chunk_id,
|
| 550 |
-
"content_with_weight": req.get("content",content)}
|
| 551 |
d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
|
| 552 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 553 |
if "important_keywords" in req:
|
| 554 |
-
if not isinstance(req["important_keywords"],list):
|
| 555 |
return get_error_data_result("`important_keywords` should be a list")
|
| 556 |
d["important_kwd"] = req.get("important_keywords")
|
| 557 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
|
|
@@ -559,18 +1205,18 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
|
|
| 559 |
d["available_int"] = int(req["available"])
|
| 560 |
embd_id = DocumentService.get_embd_id(document_id)
|
| 561 |
embd_mdl = TenantLLMService.model_instance(
|
| 562 |
-
tenant_id, LLMType.EMBEDDING.value, embd_id
|
|
|
|
| 563 |
if doc.parser_id == ParserType.QA:
|
| 564 |
-
arr = [
|
| 565 |
-
t for t in re.split(
|
| 566 |
-
r"[\n\t]",
|
| 567 |
-
d["content_with_weight"]) if len(t) > 1]
|
| 568 |
if len(arr) != 2:
|
| 569 |
return get_error_data_result(
|
| 570 |
-
retmsg="Q&A must be separated by TAB/ENTER key."
|
|
|
|
| 571 |
q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
|
| 572 |
-
d = beAdoc(
|
| 573 |
-
[rag_tokenizer.is_chinese(t) for t in q + a])
|
|
|
|
| 574 |
|
| 575 |
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
|
| 576 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
|
@@ -579,41 +1225,120 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
|
|
| 579 |
return get_result()
|
| 580 |
|
| 581 |
|
| 582 |
-
|
| 583 |
-
@manager.route('/retrieval', methods=['POST'])
|
| 584 |
@token_required
|
| 585 |
def retrieval_test(tenant_id):
|
| 586 |
req = request.json
|
| 587 |
if not req.get("dataset_ids"):
|
| 588 |
return get_error_data_result("`dataset_ids` is required.")
|
| 589 |
kb_ids = req["dataset_ids"]
|
| 590 |
-
if not isinstance(kb_ids,list):
|
| 591 |
return get_error_data_result("`dataset_ids` should be a list")
|
| 592 |
kbs = KnowledgebaseService.get_by_ids(kb_ids)
|
| 593 |
for id in kb_ids:
|
| 594 |
-
if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
|
| 595 |
return get_error_data_result(f"You don't own the dataset {id}.")
|
| 596 |
embd_nms = list(set([kb.embd_id for kb in kbs]))
|
| 597 |
if len(embd_nms) != 1:
|
| 598 |
return get_result(
|
| 599 |
retmsg='Datasets use different embedding models."',
|
| 600 |
-
retcode=RetCode.AUTHENTICATION_ERROR
|
|
|
|
| 601 |
if "question" not in req:
|
| 602 |
return get_error_data_result("`question` is required.")
|
| 603 |
page = int(req.get("offset", 1))
|
| 604 |
size = int(req.get("limit", 1024))
|
| 605 |
question = req["question"]
|
| 606 |
doc_ids = req.get("document_ids", [])
|
| 607 |
-
if not isinstance(doc_ids,list):
|
| 608 |
return get_error_data_result("`documents` should be a list")
|
| 609 |
-
doc_ids_list=KnowledgebaseService.list_documents_by_ids(kb_ids)
|
| 610 |
for doc_id in doc_ids:
|
| 611 |
if doc_id not in doc_ids_list:
|
| 612 |
-
return get_error_data_result(
|
| 613 |
similarity_threshold = float(req.get("similarity_threshold", 0.2))
|
| 614 |
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
|
| 615 |
top = int(req.get("top_k", 1024))
|
| 616 |
-
if req.get("highlight")=="False" or
|
| 617 |
highlight = False
|
| 618 |
else:
|
| 619 |
highlight = True
|
|
@@ -622,21 +1347,34 @@ def retrieval_test(tenant_id):
|
|
| 622 |
if not e:
|
| 623 |
return get_error_data_result(retmsg="Dataset not found!")
|
| 624 |
embd_mdl = TenantLLMService.model_instance(
|
| 625 |
-
kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
|
|
|
|
| 626 |
|
| 627 |
rerank_mdl = None
|
| 628 |
if req.get("rerank_id"):
|
| 629 |
rerank_mdl = TenantLLMService.model_instance(
|
| 630 |
-
kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
|
|
|
|
| 631 |
|
| 632 |
if req.get("keyword", False):
|
| 633 |
chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
|
| 634 |
question += keyword_extraction(chat_mdl, question)
|
| 635 |
|
| 636 |
retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
|
| 637 |
-
ranks = retr.retrieval(
|
| 638 |
-
|
| 639 |
-
|
| 640 |
for c in ranks["chunks"]:
|
| 641 |
if "vector" in c:
|
| 642 |
del c["vector"]
|
|
@@ -649,7 +1387,7 @@ def retrieval_test(tenant_id):
|
|
| 649 |
"content_with_weight": "content",
|
| 650 |
"doc_id": "document_id",
|
| 651 |
"important_kwd": "important_keywords",
|
| 652 |
-
"docnm_kwd": "document_keyword"
|
| 653 |
}
|
| 654 |
rename_chunk = {}
|
| 655 |
for key, value in chunk.items():
|
|
@@ -660,6 +1398,8 @@ def retrieval_test(tenant_id):
|
|
| 660 |
return get_result(data=ranks)
|
| 661 |
except Exception as e:
|
| 662 |
if str(e).find("not_found") > 0:
|
| 663 |
-
return get_result(
|
| 664 |
-
|
| 665 |
-
|
| 39 |
from api.db.services.file_service import FileService
|
| 40 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
| 41 |
from api.settings import RetCode, retrievaler
|
| 42 |
+
from api.utils.api_utils import construct_json_result, get_parser_config
|
| 43 |
from rag.nlp import search
|
| 44 |
from rag.utils import rmSpace
|
| 45 |
from rag.utils.es_conn import ELASTICSEARCH
|
|
|
|
| 49 |
MAXIMUM_OF_UPLOADING_FILES = 256
|
| 50 |
|
| 51 |
|
| 52 |
+
@manager.route("/datasets/<dataset_id>/documents", methods=["POST"])
|
|
|
|
| 53 |
@token_required
|
| 54 |
def upload(dataset_id, tenant_id):
|
| 55 |
+
"""
|
| 56 |
+
Upload documents to a dataset.
|
| 57 |
+
---
|
| 58 |
+
tags:
|
| 59 |
+
- Documents
|
| 60 |
+
security:
|
| 61 |
+
- ApiKeyAuth: []
|
| 62 |
+
parameters:
|
| 63 |
+
- in: path
|
| 64 |
+
name: dataset_id
|
| 65 |
+
type: string
|
| 66 |
+
required: true
|
| 67 |
+
description: ID of the dataset.
|
| 68 |
+
- in: header
|
| 69 |
+
name: Authorization
|
| 70 |
+
type: string
|
| 71 |
+
required: true
|
| 72 |
+
description: Bearer token for authentication.
|
| 73 |
+
- in: formData
|
| 74 |
+
name: file
|
| 75 |
+
type: file
|
| 76 |
+
required: true
|
| 77 |
+
description: Document files to upload.
|
| 78 |
+
responses:
|
| 79 |
+
200:
|
| 80 |
+
description: Successfully uploaded documents.
|
| 81 |
+
schema:
|
| 82 |
+
type: object
|
| 83 |
+
properties:
|
| 84 |
+
data:
|
| 85 |
+
type: array
|
| 86 |
+
items:
|
| 87 |
+
type: object
|
| 88 |
+
properties:
|
| 89 |
+
id:
|
| 90 |
+
type: string
|
| 91 |
+
description: Document ID.
|
| 92 |
+
name:
|
| 93 |
+
type: string
|
| 94 |
+
description: Document name.
|
| 95 |
+
chunk_count:
|
| 96 |
+
type: integer
|
| 97 |
+
description: Number of chunks.
|
| 98 |
+
token_count:
|
| 99 |
+
type: integer
|
| 100 |
+
description: Number of tokens.
|
| 101 |
+
dataset_id:
|
| 102 |
+
type: string
|
| 103 |
+
description: ID of the dataset.
|
| 104 |
+
chunk_method:
|
| 105 |
+
type: string
|
| 106 |
+
description: Chunking method used.
|
| 107 |
+
run:
|
| 108 |
+
type: string
|
| 109 |
+
description: Processing status.
|
| 110 |
+
"""
|
| 111 |
+
if "file" not in request.files:
|
| 112 |
return get_error_data_result(
|
| 113 |
+
retmsg="No file part!", retcode=RetCode.ARGUMENT_ERROR
|
| 114 |
+
)
|
| 115 |
+
file_objs = request.files.getlist("file")
|
| 116 |
for file_obj in file_objs:
|
| 117 |
+
if file_obj.filename == "":
|
| 118 |
return get_result(
|
| 119 |
+
retmsg="No file selected!", retcode=RetCode.ARGUMENT_ERROR
|
| 120 |
+
)
|
| 121 |
# total size
|
| 122 |
total_size = 0
|
| 123 |
for file_obj in file_objs:
|
| 124 |
file_obj.seek(0, os.SEEK_END)
|
| 125 |
total_size += file_obj.tell()
|
| 126 |
file_obj.seek(0)
|
| 127 |
+
MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
|
| 128 |
if total_size > MAX_TOTAL_FILE_SIZE:
|
| 129 |
return get_result(
|
| 130 |
+
retmsg=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
|
| 131 |
+
retcode=RetCode.ARGUMENT_ERROR,
|
| 132 |
+
)
|
| 133 |
e, kb = KnowledgebaseService.get_by_id(dataset_id)
|
| 134 |
if not e:
|
| 135 |
raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
|
| 136 |
+
err, files = FileService.upload_document(kb, file_objs, tenant_id)
|
| 137 |
if err:
|
| 138 |
+
return get_result(retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
|
|
|
|
| 139 |
# rename key's name
|
| 140 |
renamed_doc_list = []
|
| 141 |
for file in files:
|
|
|
|
| 144 |
"chunk_num": "chunk_count",
|
| 145 |
"kb_id": "dataset_id",
|
| 146 |
"token_num": "token_count",
|
| 147 |
+
"parser_id": "chunk_method",
|
| 148 |
}
|
| 149 |
renamed_doc = {}
|
| 150 |
for key, value in doc.items():
|
|
|
|
| 155 |
return get_result(data=renamed_doc_list)
|
| 156 |
|
| 157 |
|
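For quick manual testing of the upload route documented above, a minimal client-side sketch using Python `requests` (the host/port, the `/api/v1` prefix, the API key, and the dataset ID are placeholders for your own deployment, not values defined in this PR):

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed default host/port and route prefix
    HEADERS = {"Authorization": "Bearer <API_KEY>"}  # @token_required expects a Bearer token

    with open("sample.pdf", "rb") as f:
        resp = requests.post(
            f"{BASE}/datasets/<dataset_id>/documents",
            headers=HEADERS,
            files={"file": ("sample.pdf", f)},       # the form field must be named "file"
        )
    print(resp.json())                               # metadata of the uploaded documents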
| 158 |
+
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"])
|
| 159 |
@token_required
|
| 160 |
def update_doc(tenant_id, dataset_id, document_id):
|
| 161 |
+
"""
|
| 162 |
+
Update a document within a dataset.
|
| 163 |
+
---
|
| 164 |
+
tags:
|
| 165 |
+
- Documents
|
| 166 |
+
security:
|
| 167 |
+
- ApiKeyAuth: []
|
| 168 |
+
parameters:
|
| 169 |
+
- in: path
|
| 170 |
+
name: dataset_id
|
| 171 |
+
type: string
|
| 172 |
+
required: true
|
| 173 |
+
description: ID of the dataset.
|
| 174 |
+
- in: path
|
| 175 |
+
name: document_id
|
| 176 |
+
type: string
|
| 177 |
+
required: true
|
| 178 |
+
description: ID of the document to update.
|
| 179 |
+
- in: header
|
| 180 |
+
name: Authorization
|
| 181 |
+
type: string
|
| 182 |
+
required: true
|
| 183 |
+
description: Bearer token for authentication.
|
| 184 |
+
- in: body
|
| 185 |
+
name: body
|
| 186 |
+
description: Document update parameters.
|
| 187 |
+
required: true
|
| 188 |
+
schema:
|
| 189 |
+
type: object
|
| 190 |
+
properties:
|
| 191 |
+
name:
|
| 192 |
+
type: string
|
| 193 |
+
description: New name of the document.
|
| 194 |
+
parser_config:
|
| 195 |
+
type: object
|
| 196 |
+
description: Parser configuration.
|
| 197 |
+
chunk_method:
|
| 198 |
+
type: string
|
| 199 |
+
description: Chunking method.
|
| 200 |
+
responses:
|
| 201 |
+
200:
|
| 202 |
+
description: Document updated successfully.
|
| 203 |
+
schema:
|
| 204 |
+
type: object
|
| 205 |
+
"""
|
| 206 |
req = request.json
|
| 207 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 208 |
return get_error_data_result(retmsg="You don't own the dataset.")
|
|
|
|
| 217 |
if req["token_count"] != doc.token_num:
|
| 218 |
return get_error_data_result(retmsg="Can't change `token_count`.")
|
| 219 |
if "progress" in req:
|
| 220 |
+
if req["progress"] != doc.progress:
|
| 221 |
return get_error_data_result(retmsg="Can't change `progress`.")
|
| 222 |
|
| 223 |
if "name" in req and req["name"] != doc.name:
|
| 224 |
+
if (
|
| 225 |
+
pathlib.Path(req["name"].lower()).suffix
|
| 226 |
+
!= pathlib.Path(doc.name.lower()).suffix
|
| 227 |
+
):
|
| 228 |
+
return get_result(
|
| 229 |
+
retmsg="The extension of file can't be changed",
|
| 230 |
+
retcode=RetCode.ARGUMENT_ERROR,
|
| 231 |
+
)
|
| 232 |
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
|
| 233 |
if d.name == req["name"]:
|
| 234 |
return get_error_data_result(
|
| 235 |
+
retmsg="Duplicated document name in the same dataset."
|
| 236 |
+
)
|
| 237 |
+
if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
|
| 238 |
+
return get_error_data_result(retmsg="Database error (Document rename)!")
|
|
|
|
| 239 |
|
| 240 |
informs = File2DocumentService.get_by_document_id(document_id)
|
| 241 |
if informs:
|
|
|
|
| 244 |
if "parser_config" in req:
|
| 245 |
DocumentService.update_parser_config(doc.id, req["parser_config"])
|
| 246 |
if "chunk_method" in req:
|
| 247 |
+
valid_chunk_method = {
|
| 248 |
+
"naive",
|
| 249 |
+
"manual",
|
| 250 |
+
"qa",
|
| 251 |
+
"table",
|
| 252 |
+
"paper",
|
| 253 |
+
"book",
|
| 254 |
+
"laws",
|
| 255 |
+
"presentation",
|
| 256 |
+
"picture",
|
| 257 |
+
"one",
|
| 258 |
+
"knowledge_graph",
|
| 259 |
+
"email",
|
| 260 |
+
}
|
| 261 |
if req.get("chunk_method") not in valid_chunk_method:
|
| 262 |
+
return get_error_data_result(
|
| 263 |
+
f"`chunk_method` {req['chunk_method']} doesn't exist"
|
| 264 |
+
)
|
| 265 |
if doc.parser_id.lower() == req["chunk_method"].lower():
|
| 266 |
+
return get_result()
|
| 267 |
|
| 268 |
+
if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
|
|
|
|
| 269 |
return get_error_data_result(retmsg="Not supported yet!")
|
| 270 |
|
| 271 |
+
e = DocumentService.update_by_id(
|
| 272 |
+
doc.id,
|
| 273 |
+
{
|
| 274 |
+
"parser_id": req["chunk_method"],
|
| 275 |
+
"progress": 0,
|
| 276 |
+
"progress_msg": "",
|
| 277 |
+
"run": TaskStatus.UNSTART.value,
|
| 278 |
+
},
|
| 279 |
+
)
|
| 280 |
if not e:
|
| 281 |
return get_error_data_result(retmsg="Document not found!")
|
| 282 |
+
req["parser_config"] = get_parser_config(
|
| 283 |
+
req["chunk_method"], req.get("parser_config")
|
| 284 |
+
)
|
| 285 |
DocumentService.update_parser_config(doc.id, req["parser_config"])
|
| 286 |
if doc.token_num > 0:
|
| 287 |
+
e = DocumentService.increment_chunk_num(
|
| 288 |
+
doc.id,
|
| 289 |
+
doc.kb_id,
|
| 290 |
+
doc.token_num * -1,
|
| 291 |
+
doc.chunk_num * -1,
|
| 292 |
+
doc.process_duation * -1,
|
| 293 |
+
)
|
| 294 |
if not e:
|
| 295 |
return get_error_data_result(retmsg="Document not found!")
|
| 296 |
ELASTICSEARCH.deleteByQuery(
|
| 297 |
+
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
|
| 298 |
+
)
|
| 299 |
|
| 300 |
return get_result()
|
| 301 |
|
| 302 |
|
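A minimal sketch of calling the document-update route above; the rename must keep the original file extension and `chunk_method` must be one of the values validated in the handler (placeholder base URL, key, and IDs):

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed deployment values
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.put(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>",
        headers=HEADERS,
        json={"name": "renamed.pdf", "chunk_method": "naive"},
    )
    print(resp.json())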
| 303 |
+
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])
|
| 304 |
@token_required
|
| 305 |
def download(tenant_id, dataset_id, document_id):
|
| 306 |
+
"""
|
| 307 |
+
Download a document from a dataset.
|
| 308 |
+
---
|
| 309 |
+
tags:
|
| 310 |
+
- Documents
|
| 311 |
+
security:
|
| 312 |
+
- ApiKeyAuth: []
|
| 313 |
+
produces:
|
| 314 |
+
- application/octet-stream
|
| 315 |
+
parameters:
|
| 316 |
+
- in: path
|
| 317 |
+
name: dataset_id
|
| 318 |
+
type: string
|
| 319 |
+
required: true
|
| 320 |
+
description: ID of the dataset.
|
| 321 |
+
- in: path
|
| 322 |
+
name: document_id
|
| 323 |
+
type: string
|
| 324 |
+
required: true
|
| 325 |
+
description: ID of the document to download.
|
| 326 |
+
- in: header
|
| 327 |
+
name: Authorization
|
| 328 |
+
type: string
|
| 329 |
+
required: true
|
| 330 |
+
description: Bearer token for authentication.
|
| 331 |
+
responses:
|
| 332 |
+
200:
|
| 333 |
+
description: Document file stream.
|
| 334 |
+
schema:
|
| 335 |
+
type: file
|
| 336 |
+
400:
|
| 337 |
+
description: Error message.
|
| 338 |
+
schema:
|
| 339 |
+
type: object
|
| 340 |
+
"""
|
| 341 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 342 |
+
return get_error_data_result(retmsg=f"You do not own the dataset {dataset_id}.")
|
| 343 |
doc = DocumentService.query(kb_id=dataset_id, id=document_id)
|
| 344 |
if not doc:
|
| 345 |
+
return get_error_data_result(
|
| 346 |
+
retmsg=f"The dataset not own the document {document_id}."
|
| 347 |
+
)
|
| 348 |
# The process of downloading
|
| 349 |
+
doc_id, doc_location = File2DocumentService.get_storage_address(
|
| 350 |
+
doc_id=document_id
|
| 351 |
+
) # minio address
|
| 352 |
file_stream = STORAGE_IMPL.get(doc_id, doc_location)
|
| 353 |
if not file_stream:
|
| 354 |
+
return construct_json_result(
|
| 355 |
+
message="This file is empty.", code=RetCode.DATA_ERROR
|
| 356 |
+
)
|
| 357 |
file = BytesIO(file_stream)
|
| 358 |
# Use send_file with a proper filename and MIME type
|
| 359 |
return send_file(
|
| 360 |
file,
|
| 361 |
as_attachment=True,
|
| 362 |
download_name=doc[0].name,
|
| 363 |
+
mimetype="application/octet-stream", # Set a default MIME type
|
| 364 |
)
|
| 365 |
|
| 366 |
|
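Since the download route streams the file back as application/octet-stream, a sketch of saving it to disk (placeholder values again):

    import requests

    BASE = "http://localhost:9380/api/v1"            # assumed deployment values
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>", headers=HEADERS
    )
    with open("downloaded_document", "wb") as out:   # raw byte stream
        out.write(resp.content)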
| 367 |
+
@manager.route("/datasets/<dataset_id>/documents", methods=["GET"])
|
| 368 |
@token_required
|
| 369 |
def list_docs(dataset_id, tenant_id):
|
| 370 |
+
"""
|
| 371 |
+
List documents in a dataset.
|
| 372 |
+
---
|
| 373 |
+
tags:
|
| 374 |
+
- Documents
|
| 375 |
+
security:
|
| 376 |
+
- ApiKeyAuth: []
|
| 377 |
+
parameters:
|
| 378 |
+
- in: path
|
| 379 |
+
name: dataset_id
|
| 380 |
+
type: string
|
| 381 |
+
required: true
|
| 382 |
+
description: ID of the dataset.
|
| 383 |
+
- in: query
|
| 384 |
+
name: id
|
| 385 |
+
type: string
|
| 386 |
+
required: false
|
| 387 |
+
description: Filter by document ID.
|
| 388 |
+
- in: query
|
| 389 |
+
name: offset
|
| 390 |
+
type: integer
|
| 391 |
+
required: false
|
| 392 |
+
default: 1
|
| 393 |
+
description: Page number.
|
| 394 |
+
- in: query
|
| 395 |
+
name: limit
|
| 396 |
+
type: integer
|
| 397 |
+
required: false
|
| 398 |
+
default: 1024
|
| 399 |
+
description: Number of items per page.
|
| 400 |
+
- in: query
|
| 401 |
+
name: orderby
|
| 402 |
+
type: string
|
| 403 |
+
required: false
|
| 404 |
+
default: "create_time"
|
| 405 |
+
description: Field to order by.
|
| 406 |
+
- in: query
|
| 407 |
+
name: desc
|
| 408 |
+
type: boolean
|
| 409 |
+
required: false
|
| 410 |
+
default: true
|
| 411 |
+
description: Whether to sort results in descending order.
|
| 412 |
+
- in: header
|
| 413 |
+
name: Authorization
|
| 414 |
+
type: string
|
| 415 |
+
required: true
|
| 416 |
+
description: Bearer token for authentication.
|
| 417 |
+
responses:
|
| 418 |
+
200:
|
| 419 |
+
description: List of documents.
|
| 420 |
+
schema:
|
| 421 |
+
type: object
|
| 422 |
+
properties:
|
| 423 |
+
total:
|
| 424 |
+
type: integer
|
| 425 |
+
description: Total number of documents.
|
| 426 |
+
docs:
|
| 427 |
+
type: array
|
| 428 |
+
items:
|
| 429 |
+
type: object
|
| 430 |
+
properties:
|
| 431 |
+
id:
|
| 432 |
+
type: string
|
| 433 |
+
description: Document ID.
|
| 434 |
+
name:
|
| 435 |
+
type: string
|
| 436 |
+
description: Document name.
|
| 437 |
+
chunk_count:
|
| 438 |
+
type: integer
|
| 439 |
+
description: Number of chunks.
|
| 440 |
+
token_count:
|
| 441 |
+
type: integer
|
| 442 |
+
description: Number of tokens.
|
| 443 |
+
dataset_id:
|
| 444 |
+
type: string
|
| 445 |
+
description: ID of the dataset.
|
| 446 |
+
chunk_method:
|
| 447 |
+
type: string
|
| 448 |
+
description: Chunking method used.
|
| 449 |
+
run:
|
| 450 |
+
type: string
|
| 451 |
+
description: Processing status.
|
| 452 |
+
"""
|
| 453 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 454 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
|
| 455 |
id = request.args.get("id")
|
| 456 |
name = request.args.get("name")
|
| 457 |
+
if not DocumentService.query(id=id, kb_id=dataset_id):
|
| 458 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
| 459 |
+
if not DocumentService.query(name=name, kb_id=dataset_id):
|
| 460 |
return get_error_data_result(retmsg=f"You don't own the document {name}.")
|
| 461 |
offset = int(request.args.get("offset", 1))
|
| 462 |
+
keywords = request.args.get("keywords", "")
|
| 463 |
limit = int(request.args.get("limit", 1024))
|
| 464 |
orderby = request.args.get("orderby", "create_time")
|
| 465 |
if request.args.get("desc") == "False":
|
| 466 |
desc = False
|
| 467 |
else:
|
| 468 |
desc = True
|
| 469 |
+
docs, tol = DocumentService.get_list(
|
| 470 |
+
dataset_id, offset, limit, orderby, desc, keywords, id, name
|
| 471 |
+
)
|
| 472 |
|
| 473 |
# rename key's name
|
| 474 |
renamed_doc_list = []
|
|
|
|
| 477 |
"chunk_num": "chunk_count",
|
| 478 |
"kb_id": "dataset_id",
|
| 479 |
"token_num": "token_count",
|
| 480 |
+
"parser_id": "chunk_method",
|
| 481 |
}
|
| 482 |
run_mapping = {
|
| 483 |
+
"0": "UNSTART",
|
| 484 |
+
"1": "RUNNING",
|
| 485 |
+
"2": "CANCEL",
|
| 486 |
+
"3": "DONE",
|
| 487 |
+
"4": "FAIL",
|
| 488 |
}
|
| 489 |
renamed_doc = {}
|
| 490 |
for key, value in doc.items():
|
| 491 |
+
if key == "run":
|
| 492 |
+
renamed_doc["run"] = run_mapping.get(str(value))
|
| 493 |
new_key = key_mapping.get(key, key)
|
| 494 |
renamed_doc[new_key] = value
|
| 495 |
+
if key == "run":
|
| 496 |
+
renamed_doc["run"] = run_mapping.get(value)
|
| 497 |
renamed_doc_list.append(renamed_doc)
|
| 498 |
return get_result(data={"total": tol, "docs": renamed_doc_list})
|
| 499 |
|
| 500 |
|
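A sketch of paging through documents with the query parameters described above; the body assumes the standard `data` result envelope with the `total`/`docs` schema (placeholder base URL, key, and dataset ID):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents",
        headers=HEADERS,
        params={"offset": 1, "limit": 30, "orderby": "create_time", "desc": "True"},
    )
    payload = resp.json()["data"]                    # assumed result envelope
    print(payload["total"], [d["name"] for d in payload["docs"]])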
| 501 |
+
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])
|
| 502 |
@token_required
|
| 503 |
+
def delete(tenant_id, dataset_id):
|
| 504 |
+
"""
|
| 505 |
+
Delete documents from a dataset.
|
| 506 |
+
---
|
| 507 |
+
tags:
|
| 508 |
+
- Documents
|
| 509 |
+
security:
|
| 510 |
+
- ApiKeyAuth: []
|
| 511 |
+
parameters:
|
| 512 |
+
- in: path
|
| 513 |
+
name: dataset_id
|
| 514 |
+
type: string
|
| 515 |
+
required: true
|
| 516 |
+
description: ID of the dataset.
|
| 517 |
+
- in: body
|
| 518 |
+
name: body
|
| 519 |
+
description: Document deletion parameters.
|
| 520 |
+
required: true
|
| 521 |
+
schema:
|
| 522 |
+
type: object
|
| 523 |
+
properties:
|
| 524 |
+
ids:
|
| 525 |
+
type: array
|
| 526 |
+
items:
|
| 527 |
+
type: string
|
| 528 |
+
description: List of document IDs to delete.
|
| 529 |
+
- in: header
|
| 530 |
+
name: Authorization
|
| 531 |
+
type: string
|
| 532 |
+
required: true
|
| 533 |
+
description: Bearer token for authentication.
|
| 534 |
+
responses:
|
| 535 |
+
200:
|
| 536 |
+
description: Documents deleted successfully.
|
| 537 |
+
schema:
|
| 538 |
+
type: object
|
| 539 |
+
"""
|
| 540 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 541 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
|
| 542 |
req = request.json
|
| 543 |
if not req:
|
| 544 |
+
doc_ids = None
|
| 545 |
else:
|
| 546 |
+
doc_ids = req.get("ids")
|
| 547 |
if not doc_ids:
|
| 548 |
doc_list = []
|
| 549 |
+
docs = DocumentService.query(kb_id=dataset_id)
|
| 550 |
for doc in docs:
|
| 551 |
doc_list.append(doc.id)
|
| 552 |
else:
|
| 553 |
+
doc_list = doc_ids
|
| 554 |
root_folder = FileService.get_root_folder(tenant_id)
|
| 555 |
pf_id = root_folder["id"]
|
| 556 |
FileService.init_knowledgebase_docs(pf_id, tenant_id)
|
|
|
|
| 568 |
|
| 569 |
if not DocumentService.remove_document(doc, tenant_id):
|
| 570 |
return get_error_data_result(
|
| 571 |
+
retmsg="Database error (Document removal)!"
|
| 572 |
+
)
|
| 573 |
|
| 574 |
f2d = File2DocumentService.get_by_document_id(doc_id)
|
| 575 |
+
FileService.filter_delete(
|
| 576 |
+
[
|
| 577 |
+
File.source_type == FileSource.KNOWLEDGEBASE,
|
| 578 |
+
File.id == f2d[0].file_id,
|
| 579 |
+
]
|
| 580 |
+
)
|
| 581 |
File2DocumentService.delete_by_document_id(doc_id)
|
| 582 |
|
| 583 |
STORAGE_IMPL.rm(b, n)
|
|
|
|
| 590 |
return get_result()
|
| 591 |
|
| 592 |
|
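A sketch of the deletion call; per the handler above, omitting `ids` removes every document in the dataset, so pass explicit IDs in normal use (placeholder values):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/documents",
        headers=HEADERS,
        json={"ids": ["<document_id_1>", "<document_id_2>"]},
    )
    print(resp.json())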
| 593 |
+
@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])
|
| 594 |
@token_required
|
| 595 |
+
def parse(tenant_id, dataset_id):
|
| 596 |
+
"""
|
| 597 |
+
Start parsing documents into chunks.
|
| 598 |
+
---
|
| 599 |
+
tags:
|
| 600 |
+
- Chunks
|
| 601 |
+
security:
|
| 602 |
+
- ApiKeyAuth: []
|
| 603 |
+
parameters:
|
| 604 |
+
- in: path
|
| 605 |
+
name: dataset_id
|
| 606 |
+
type: string
|
| 607 |
+
required: true
|
| 608 |
+
description: ID of the dataset.
|
| 609 |
+
- in: body
|
| 610 |
+
name: body
|
| 611 |
+
description: Parsing parameters.
|
| 612 |
+
required: true
|
| 613 |
+
schema:
|
| 614 |
+
type: object
|
| 615 |
+
properties:
|
| 616 |
+
document_ids:
|
| 617 |
+
type: array
|
| 618 |
+
items:
|
| 619 |
+
type: string
|
| 620 |
+
description: List of document IDs to parse.
|
| 621 |
+
- in: header
|
| 622 |
+
name: Authorization
|
| 623 |
+
type: string
|
| 624 |
+
required: true
|
| 625 |
+
description: Bearer token for authentication.
|
| 626 |
+
responses:
|
| 627 |
+
200:
|
| 628 |
+
description: Parsing started successfully.
|
| 629 |
+
schema:
|
| 630 |
+
type: object
|
| 631 |
+
"""
|
| 632 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 633 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 634 |
req = request.json
|
| 635 |
if not req.get("document_ids"):
|
| 636 |
return get_error_data_result("`document_ids` is required")
|
| 637 |
for id in req["document_ids"]:
|
| 638 |
+
doc = DocumentService.query(id=id, kb_id=dataset_id)
|
| 639 |
if not doc:
|
| 640 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
| 641 |
+
if doc[0].progress != 0.0:
|
| 642 |
+
return get_error_data_result(
|
| 643 |
+
"Can't stop parsing document with progress at 0 or 100"
|
| 644 |
+
)
|
| 645 |
info = {"run": "1", "progress": 0}
|
| 646 |
info["progress_msg"] = ""
|
| 647 |
info["chunk_num"] = 0
|
| 648 |
info["token_num"] = 0
|
| 649 |
DocumentService.update_by_id(id, info)
|
| 650 |
ELASTICSEARCH.deleteByQuery(
|
| 651 |
+
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
| 652 |
+
)
|
| 653 |
TaskService.filter_delete([Task.doc_id == id])
|
| 654 |
e, doc = DocumentService.get_by_id(id)
|
| 655 |
doc = doc.to_dict()
|
|
|
|
| 658 |
queue_tasks(doc, bucket, name)
|
| 659 |
return get_result()
|
| 660 |
|
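A sketch of queuing documents for chunking via the parse route above (placeholder values); documents whose progress is already non-zero are rejected by the handler:

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.post(
        f"{BASE}/datasets/<dataset_id>/chunks",
        headers=HEADERS,
        json={"document_ids": ["<document_id>"]},
    )
    print(resp.json())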
| 661 |
+
|
| 662 |
+
@manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])
|
| 663 |
@token_required
|
| 664 |
+
def stop_parsing(tenant_id, dataset_id):
|
| 665 |
+
"""
|
| 666 |
+
Stop parsing documents into chunks.
|
| 667 |
+
---
|
| 668 |
+
tags:
|
| 669 |
+
- Chunks
|
| 670 |
+
security:
|
| 671 |
+
- ApiKeyAuth: []
|
| 672 |
+
parameters:
|
| 673 |
+
- in: path
|
| 674 |
+
name: dataset_id
|
| 675 |
+
type: string
|
| 676 |
+
required: true
|
| 677 |
+
description: ID of the dataset.
|
| 678 |
+
- in: body
|
| 679 |
+
name: body
|
| 680 |
+
description: Stop parsing parameters.
|
| 681 |
+
required: true
|
| 682 |
+
schema:
|
| 683 |
+
type: object
|
| 684 |
+
properties:
|
| 685 |
+
document_ids:
|
| 686 |
+
type: array
|
| 687 |
+
items:
|
| 688 |
+
type: string
|
| 689 |
+
description: List of document IDs to stop parsing.
|
| 690 |
+
- in: header
|
| 691 |
+
name: Authorization
|
| 692 |
+
type: string
|
| 693 |
+
required: true
|
| 694 |
+
description: Bearer token for authentication.
|
| 695 |
+
responses:
|
| 696 |
+
200:
|
| 697 |
+
description: Parsing stopped successfully.
|
| 698 |
+
schema:
|
| 699 |
+
type: object
|
| 700 |
+
"""
|
| 701 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 702 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 703 |
req = request.json
|
|
|
|
| 708 |
if not doc:
|
| 709 |
return get_error_data_result(retmsg=f"You don't own the document {id}.")
|
| 710 |
if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
|
| 711 |
+
return get_error_data_result(
|
| 712 |
+
"Can't stop parsing document with progress at 0 or 1"
|
| 713 |
+
)
|
| 714 |
+
info = {"run": "2", "progress": 0, "chunk_num": 0}
|
| 715 |
DocumentService.update_by_id(id, info)
|
| 716 |
ELASTICSEARCH.deleteByQuery(
|
| 717 |
+
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
| 718 |
+
)
|
| 719 |
return get_result()
|
| 720 |
|
| 721 |
|
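The companion call that cancels parsing, sketched with placeholder values; documents already at progress 0 or 1 cannot be stopped:

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/chunks",
        headers=HEADERS,
        json={"document_ids": ["<document_id>"]},
    )
    print(resp.json())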
| 722 |
+
@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])
|
| 723 |
@token_required
|
| 724 |
+
def list_chunks(tenant_id, dataset_id, document_id):
|
| 725 |
+
"""
|
| 726 |
+
List chunks of a document.
|
| 727 |
+
---
|
| 728 |
+
tags:
|
| 729 |
+
- Chunks
|
| 730 |
+
security:
|
| 731 |
+
- ApiKeyAuth: []
|
| 732 |
+
parameters:
|
| 733 |
+
- in: path
|
| 734 |
+
name: dataset_id
|
| 735 |
+
type: string
|
| 736 |
+
required: true
|
| 737 |
+
description: ID of the dataset.
|
| 738 |
+
- in: path
|
| 739 |
+
name: document_id
|
| 740 |
+
type: string
|
| 741 |
+
required: true
|
| 742 |
+
description: ID of the document.
|
| 743 |
+
- in: query
|
| 744 |
+
name: offset
|
| 745 |
+
type: integer
|
| 746 |
+
required: false
|
| 747 |
+
default: 1
|
| 748 |
+
description: Page number.
|
| 749 |
+
- in: query
|
| 750 |
+
name: limit
|
| 751 |
+
type: integer
|
| 752 |
+
required: false
|
| 753 |
+
default: 30
|
| 754 |
+
description: Number of items per page.
|
| 755 |
+
- in: header
|
| 756 |
+
name: Authorization
|
| 757 |
+
type: string
|
| 758 |
+
required: true
|
| 759 |
+
description: Bearer token for authentication.
|
| 760 |
+
responses:
|
| 761 |
+
200:
|
| 762 |
+
description: List of chunks.
|
| 763 |
+
schema:
|
| 764 |
+
type: object
|
| 765 |
+
properties:
|
| 766 |
+
total:
|
| 767 |
+
type: integer
|
| 768 |
+
description: Total number of chunks.
|
| 769 |
+
chunks:
|
| 770 |
+
type: array
|
| 771 |
+
items:
|
| 772 |
+
type: object
|
| 773 |
+
properties:
|
| 774 |
+
id:
|
| 775 |
+
type: string
|
| 776 |
+
description: Chunk ID.
|
| 777 |
+
content:
|
| 778 |
+
type: string
|
| 779 |
+
description: Chunk content.
|
| 780 |
+
document_id:
|
| 781 |
+
type: string
|
| 782 |
+
description: ID of the document.
|
| 783 |
+
important_keywords:
|
| 784 |
+
type: array
|
| 785 |
+
items:
|
| 786 |
+
type: string
|
| 787 |
+
description: Important keywords.
|
| 788 |
+
image_id:
|
| 789 |
+
type: string
|
| 790 |
+
description: Image ID associated with the chunk.
|
| 791 |
+
doc:
|
| 792 |
+
type: object
|
| 793 |
+
description: Document details.
|
| 794 |
+
"""
|
| 795 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 796 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 797 |
+
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 798 |
if not doc:
|
| 799 |
+
return get_error_data_result(
|
| 800 |
+
retmsg=f"You don't own the document {document_id}."
|
| 801 |
+
)
|
| 802 |
+
doc = doc[0]
|
| 803 |
req = request.args
|
| 804 |
doc_id = document_id
|
| 805 |
page = int(req.get("offset", 1))
|
| 806 |
size = int(req.get("limit", 30))
|
| 807 |
question = req.get("keywords", "")
|
| 808 |
query = {
|
| 809 |
+
"doc_ids": [doc_id],
|
| 810 |
+
"page": page,
|
| 811 |
+
"size": size,
|
| 812 |
+
"question": question,
|
| 813 |
+
"sort": True,
|
| 814 |
}
|
| 815 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 816 |
key_mapping = {
|
| 817 |
"chunk_num": "chunk_count",
|
| 818 |
"kb_id": "dataset_id",
|
| 819 |
"token_num": "token_count",
|
| 820 |
+
"parser_id": "chunk_method",
|
| 821 |
}
|
| 822 |
run_mapping = {
|
| 823 |
"0": "UNSTART",
|
| 824 |
"1": "RUNNING",
|
| 825 |
"2": "CANCEL",
|
| 826 |
"3": "DONE",
|
| 827 |
+
"4": "FAIL",
|
| 828 |
}
|
| 829 |
+
doc = doc.to_dict()
|
| 830 |
renamed_doc = {}
|
| 831 |
for key, value in doc.items():
|
| 832 |
new_key = key_mapping.get(key, key)
|
|
|
|
| 839 |
for id in sres.ids:
|
| 840 |
d = {
|
| 841 |
"chunk_id": id,
|
| 842 |
+
"content_with_weight": (
|
| 843 |
+
rmSpace(sres.highlight[id])
|
| 844 |
+
if question and id in sres.highlight
|
| 845 |
+
else sres.field[id].get("content_with_weight", "")
|
| 846 |
+
),
|
| 847 |
"doc_id": sres.field[id]["doc_id"],
|
| 848 |
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
| 849 |
"important_kwd": sres.field[id].get("important_kwd", []),
|
| 850 |
"img_id": sres.field[id].get("img_id", ""),
|
| 851 |
"available_int": sres.field[id].get("available_int", 1),
|
| 852 |
+
"positions": sres.field[id].get("position_int", "").split("\t"),
|
| 853 |
}
|
| 854 |
if len(d["positions"]) % 5 == 0:
|
| 855 |
poss = []
|
| 856 |
for i in range(0, len(d["positions"]), 5):
|
| 857 |
+
poss.append(
|
| 858 |
+
[
|
| 859 |
+
float(d["positions"][i]),
|
| 860 |
+
float(d["positions"][i + 1]),
|
| 861 |
+
float(d["positions"][i + 2]),
|
| 862 |
+
float(d["positions"][i + 3]),
|
| 863 |
+
float(d["positions"][i + 4]),
|
| 864 |
+
]
|
| 865 |
+
)
|
| 866 |
d["positions"] = poss
|
| 867 |
|
| 868 |
origin_chunks.append(d)
|
|
|
|
| 882 |
"doc_id": "document_id",
|
| 883 |
"important_kwd": "important_keywords",
|
| 884 |
"img_id": "image_id",
|
| 885 |
+
"available_int": "available",
|
| 886 |
}
|
| 887 |
renamed_chunk = {}
|
| 888 |
for key, value in chunk.items():
|
|
|
|
| 896 |
return get_result(data=res)
|
| 897 |
|
| 898 |
|
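A sketch of listing a document's chunks with the pagination and keyword parameters above (placeholder values):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.get(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        params={"offset": 1, "limit": 30, "keywords": ""},
    )
    print(resp.json())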
| 899 |
+
@manager.route(
|
| 900 |
+
"/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
|
| 901 |
+
)
|
| 902 |
@token_required
|
| 903 |
+
def add_chunk(tenant_id, dataset_id, document_id):
|
| 904 |
+
"""
|
| 905 |
+
Add a chunk to a document.
|
| 906 |
+
---
|
| 907 |
+
tags:
|
| 908 |
+
- Chunks
|
| 909 |
+
security:
|
| 910 |
+
- ApiKeyAuth: []
|
| 911 |
+
parameters:
|
| 912 |
+
- in: path
|
| 913 |
+
name: dataset_id
|
| 914 |
+
type: string
|
| 915 |
+
required: true
|
| 916 |
+
description: ID of the dataset.
|
| 917 |
+
- in: path
|
| 918 |
+
name: document_id
|
| 919 |
+
type: string
|
| 920 |
+
required: true
|
| 921 |
+
description: ID of the document.
|
| 922 |
+
- in: body
|
| 923 |
+
name: body
|
| 924 |
+
description: Chunk data.
|
| 925 |
+
required: true
|
| 926 |
+
schema:
|
| 927 |
+
type: object
|
| 928 |
+
properties:
|
| 929 |
+
content:
|
| 930 |
+
type: string
|
| 931 |
+
required: true
|
| 932 |
+
description: Content of the chunk.
|
| 933 |
+
important_keywords:
|
| 934 |
+
type: array
|
| 935 |
+
items:
|
| 936 |
+
type: string
|
| 937 |
+
description: Important keywords.
|
| 938 |
+
- in: header
|
| 939 |
+
name: Authorization
|
| 940 |
+
type: string
|
| 941 |
+
required: true
|
| 942 |
+
description: Bearer token for authentication.
|
| 943 |
+
responses:
|
| 944 |
+
200:
|
| 945 |
+
description: Chunk added successfully.
|
| 946 |
+
schema:
|
| 947 |
+
type: object
|
| 948 |
+
properties:
|
| 949 |
+
chunk:
|
| 950 |
+
type: object
|
| 951 |
+
properties:
|
| 952 |
+
id:
|
| 953 |
+
type: string
|
| 954 |
+
description: Chunk ID.
|
| 955 |
+
content:
|
| 956 |
+
type: string
|
| 957 |
+
description: Chunk content.
|
| 958 |
+
document_id:
|
| 959 |
+
type: string
|
| 960 |
+
description: ID of the document.
|
| 961 |
+
important_keywords:
|
| 962 |
+
type: array
|
| 963 |
+
items:
|
| 964 |
+
type: string
|
| 965 |
+
description: Important keywords.
|
| 966 |
+
"""
|
| 967 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 968 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 969 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 970 |
if not doc:
|
| 971 |
+
return get_error_data_result(
|
| 972 |
+
retmsg=f"You don't own the document {document_id}."
|
| 973 |
+
)
|
| 974 |
doc = doc[0]
|
| 975 |
req = request.json
|
| 976 |
if not req.get("content"):
|
| 977 |
return get_error_data_result(retmsg="`content` is required")
|
| 978 |
if "important_keywords" in req:
|
| 979 |
if type(req["important_keywords"]) != list:
|
| 980 |
+
return get_error_data_result(
|
| 981 |
+
"`important_keywords` is required to be a list"
|
| 982 |
+
)
|
| 983 |
md5 = hashlib.md5()
|
| 984 |
md5.update((req["content"] + document_id).encode("utf-8"))
|
| 985 |
|
| 986 |
chunk_id = md5.hexdigest()
|
| 987 |
+
d = {
|
| 988 |
+
"id": chunk_id,
|
| 989 |
+
"content_ltks": rag_tokenizer.tokenize(req["content"]),
|
| 990 |
+
"content_with_weight": req["content"],
|
| 991 |
+
}
|
| 992 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 993 |
d["important_kwd"] = req.get("important_keywords", [])
|
| 994 |
+
d["important_tks"] = rag_tokenizer.tokenize(
|
| 995 |
+
" ".join(req.get("important_keywords", []))
|
| 996 |
+
)
|
| 997 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
| 998 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
| 999 |
d["kb_id"] = [doc.kb_id]
|
|
|
|
| 1001 |
d["doc_id"] = doc.id
|
| 1002 |
embd_id = DocumentService.get_embd_id(document_id)
|
| 1003 |
embd_mdl = TenantLLMService.model_instance(
|
| 1004 |
+
tenant_id, LLMType.EMBEDDING.value, embd_id
|
| 1005 |
+
)
|
| 1006 |
+
print(embd_mdl, flush=True)
|
| 1007 |
v, c = embd_mdl.encode([doc.name, req["content"]])
|
| 1008 |
v = 0.1 * v[0] + 0.9 * v[1]
|
| 1009 |
d["q_%d_vec" % len(v)] = v.tolist()
|
| 1010 |
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
| 1011 |
|
| 1012 |
+
DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
|
|
|
|
| 1013 |
d["chunk_id"] = chunk_id
|
| 1014 |
+
d["kb_id"] = doc.kb_id
|
| 1015 |
# rename keys
|
| 1016 |
key_mapping = {
|
| 1017 |
"chunk_id": "id",
|
|
|
|
| 1021 |
"kb_id": "dataset_id",
|
| 1022 |
"create_timestamp_flt": "create_timestamp",
|
| 1023 |
"create_time": "create_time",
|
| 1024 |
+
"document_keyword": "document",
|
| 1025 |
}
|
| 1026 |
renamed_chunk = {}
|
| 1027 |
for key, value in d.items():
|
|
|
|
| 1032 |
# return get_result(data={"chunk_id": chunk_id})
|
| 1033 |
|
| 1034 |
|
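A sketch of adding a chunk manually; `content` is required and `important_keywords` must be a list, per the validation above (placeholder values):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.post(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        json={"content": "Manually added chunk text.", "important_keywords": ["example"]},
    )
    print(resp.json())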
| 1035 |
+
@manager.route(
|
| 1036 |
+
"datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
|
| 1037 |
+
)
|
| 1038 |
@token_required
|
| 1039 |
+
def rm_chunk(tenant_id, dataset_id, document_id):
|
| 1040 |
+
"""
|
| 1041 |
+
Remove chunks from a document.
|
| 1042 |
+
---
|
| 1043 |
+
tags:
|
| 1044 |
+
- Chunks
|
| 1045 |
+
security:
|
| 1046 |
+
- ApiKeyAuth: []
|
| 1047 |
+
parameters:
|
| 1048 |
+
- in: path
|
| 1049 |
+
name: dataset_id
|
| 1050 |
+
type: string
|
| 1051 |
+
required: true
|
| 1052 |
+
description: ID of the dataset.
|
| 1053 |
+
- in: path
|
| 1054 |
+
name: document_id
|
| 1055 |
+
type: string
|
| 1056 |
+
required: true
|
| 1057 |
+
description: ID of the document.
|
| 1058 |
+
- in: body
|
| 1059 |
+
name: body
|
| 1060 |
+
description: Chunk removal parameters.
|
| 1061 |
+
required: true
|
| 1062 |
+
schema:
|
| 1063 |
+
type: object
|
| 1064 |
+
properties:
|
| 1065 |
+
chunk_ids:
|
| 1066 |
+
type: array
|
| 1067 |
+
items:
|
| 1068 |
+
type: string
|
| 1069 |
+
description: List of chunk IDs to remove.
|
| 1070 |
+
- in: header
|
| 1071 |
+
name: Authorization
|
| 1072 |
+
type: string
|
| 1073 |
+
required: true
|
| 1074 |
+
description: Bearer token for authentication.
|
| 1075 |
+
responses:
|
| 1076 |
+
200:
|
| 1077 |
+
description: Chunks removed successfully.
|
| 1078 |
+
schema:
|
| 1079 |
+
type: object
|
| 1080 |
+
"""
|
| 1081 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 1082 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 1083 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 1084 |
if not doc:
|
| 1085 |
+
return get_error_data_result(
|
| 1086 |
+
retmsg=f"You don't own the document {document_id}."
|
| 1087 |
+
)
|
| 1088 |
doc = doc[0]
|
| 1089 |
req = request.json
|
| 1090 |
+
if not req.get("chunk_ids"):
|
| 1091 |
+
return get_error_data_result("`chunk_ids` is required")
|
| 1092 |
+
query = {"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
|
| 1093 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 1094 |
if not req:
|
| 1095 |
+
chunk_ids = None
|
| 1096 |
else:
|
| 1097 |
+
chunk_ids = req.get("chunk_ids")
|
| 1098 |
if not chunk_ids:
|
| 1099 |
+
chunk_list = sres.ids
|
| 1100 |
else:
|
| 1101 |
+
chunk_list = chunk_ids
|
| 1102 |
for chunk_id in chunk_list:
|
| 1103 |
if chunk_id not in sres.ids:
|
| 1104 |
return get_error_data_result(f"Chunk {chunk_id} not found")
|
| 1105 |
if not ELASTICSEARCH.deleteByQuery(
|
| 1106 |
+
Q("ids", values=chunk_list), search.index_name(tenant_id)
|
| 1107 |
+
):
|
| 1108 |
return get_error_data_result(retmsg="Index updating failure")
|
| 1109 |
deleted_chunk_ids = chunk_list
|
| 1110 |
chunk_number = len(deleted_chunk_ids)
|
|
|
|
| 1112 |
return get_result()
|
| 1113 |
|
| 1114 |
|
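A sketch of removing specific chunks with the route above (placeholder values); `chunk_ids` is required by the handler:

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.delete(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks",
        headers=HEADERS,
        json={"chunk_ids": ["<chunk_id>"]},
    )
    print(resp.json())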
| 1115 |
+
@manager.route(
|
| 1116 |
+
"/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
|
| 1117 |
+
)
|
| 1118 |
@token_required
|
| 1119 |
+
def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
| 1120 |
+
"""
|
| 1121 |
+
Update a chunk within a document.
|
| 1122 |
+
---
|
| 1123 |
+
tags:
|
| 1124 |
+
- Chunks
|
| 1125 |
+
security:
|
| 1126 |
+
- ApiKeyAuth: []
|
| 1127 |
+
parameters:
|
| 1128 |
+
- in: path
|
| 1129 |
+
name: dataset_id
|
| 1130 |
+
type: string
|
| 1131 |
+
required: true
|
| 1132 |
+
description: ID of the dataset.
|
| 1133 |
+
- in: path
|
| 1134 |
+
name: document_id
|
| 1135 |
+
type: string
|
| 1136 |
+
required: true
|
| 1137 |
+
description: ID of the document.
|
| 1138 |
+
- in: path
|
| 1139 |
+
name: chunk_id
|
| 1140 |
+
type: string
|
| 1141 |
+
required: true
|
| 1142 |
+
description: ID of the chunk to update.
|
| 1143 |
+
- in: body
|
| 1144 |
+
name: body
|
| 1145 |
+
description: Chunk update parameters.
|
| 1146 |
+
required: true
|
| 1147 |
+
schema:
|
| 1148 |
+
type: object
|
| 1149 |
+
properties:
|
| 1150 |
+
content:
|
| 1151 |
+
type: string
|
| 1152 |
+
description: Updated content of the chunk.
|
| 1153 |
+
important_keywords:
|
| 1154 |
+
type: array
|
| 1155 |
+
items:
|
| 1156 |
+
type: string
|
| 1157 |
+
description: Updated important keywords.
|
| 1158 |
+
available:
|
| 1159 |
+
type: boolean
|
| 1160 |
+
description: Availability status of the chunk.
|
| 1161 |
+
- in: header
|
| 1162 |
+
name: Authorization
|
| 1163 |
+
type: string
|
| 1164 |
+
required: true
|
| 1165 |
+
description: Bearer token for authentication.
|
| 1166 |
+
responses:
|
| 1167 |
+
200:
|
| 1168 |
+
description: Chunk updated successfully.
|
| 1169 |
+
schema:
|
| 1170 |
+
type: object
|
| 1171 |
+
"""
|
| 1172 |
try:
|
| 1173 |
+
res = ELASTICSEARCH.get(chunk_id, search.index_name(tenant_id))
|
|
|
|
|
|
|
| 1174 |
except Exception as e:
|
| 1175 |
return get_error_data_result(f"Can't find this chunk {chunk_id}")
|
| 1176 |
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
|
| 1177 |
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
|
| 1178 |
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
| 1179 |
if not doc:
|
| 1180 |
+
return get_error_data_result(
|
| 1181 |
+
retmsg=f"You don't own the document {document_id}."
|
| 1182 |
+
)
|
| 1183 |
doc = doc[0]
|
| 1184 |
query = {
|
| 1185 |
+
"doc_ids": [document_id],
|
| 1186 |
+
"page": 1,
|
| 1187 |
+
"size": 1024,
|
| 1188 |
+
"question": "",
|
| 1189 |
+
"sort": True,
|
| 1190 |
}
|
| 1191 |
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
| 1192 |
if chunk_id not in sres.ids:
|
| 1193 |
return get_error_data_result(f"You don't own the chunk {chunk_id}")
|
| 1194 |
req = request.json
|
| 1195 |
+
content = res["_source"].get("content_with_weight")
|
| 1196 |
+
d = {"id": chunk_id, "content_with_weight": req.get("content", content)}
|
|
|
|
|
|
|
| 1197 |
d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
|
| 1198 |
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
| 1199 |
if "important_keywords" in req:
|
| 1200 |
+
if not isinstance(req["important_keywords"], list):
|
| 1201 |
return get_error_data_result("`important_keywords` should be a list")
|
| 1202 |
d["important_kwd"] = req.get("important_keywords")
|
| 1203 |
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
|
|
|
|
| 1205 |
d["available_int"] = int(req["available"])
|
| 1206 |
embd_id = DocumentService.get_embd_id(document_id)
|
| 1207 |
embd_mdl = TenantLLMService.model_instance(
|
| 1208 |
+
tenant_id, LLMType.EMBEDDING.value, embd_id
|
| 1209 |
+
)
|
| 1210 |
if doc.parser_id == ParserType.QA:
|
| 1211 |
+
arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
|
|
|
|
|
|
|
|
|
|
| 1212 |
if len(arr) != 2:
|
| 1213 |
return get_error_data_result(
|
| 1214 |
+
retmsg="Q&A must be separated by TAB/ENTER key."
|
| 1215 |
+
)
|
| 1216 |
q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
|
| 1217 |
+
d = beAdoc(
|
| 1218 |
+
d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
|
| 1219 |
+
)
|
| 1220 |
|
| 1221 |
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
|
| 1222 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
|
|
|
| 1225 |
return get_result()
|
| 1226 |
|
| 1227 |
|
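A sketch of updating a chunk's content, keywords, and availability flag via the route above (placeholder values):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.put(
        f"{BASE}/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>",
        headers=HEADERS,
        json={"content": "Corrected chunk text.", "important_keywords": [], "available": True},
    )
    print(resp.json())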
| 1228 |
+
@manager.route("/retrieval", methods=["POST"])
|
|
|
|
| 1229 |
@token_required
|
| 1230 |
def retrieval_test(tenant_id):
|
| 1231 |
+
"""
|
| 1232 |
+
Retrieve chunks based on a query.
|
| 1233 |
+
---
|
| 1234 |
+
tags:
|
| 1235 |
+
- Retrieval
|
| 1236 |
+
security:
|
| 1237 |
+
- ApiKeyAuth: []
|
| 1238 |
+
parameters:
|
| 1239 |
+
- in: body
|
| 1240 |
+
name: body
|
| 1241 |
+
description: Retrieval parameters.
|
| 1242 |
+
required: true
|
| 1243 |
+
schema:
|
| 1244 |
+
type: object
|
| 1245 |
+
properties:
|
| 1246 |
+
dataset_ids:
|
| 1247 |
+
type: array
|
| 1248 |
+
items:
|
| 1249 |
+
type: string
|
| 1250 |
+
required: true
|
| 1251 |
+
description: List of dataset IDs to search in.
|
| 1252 |
+
question:
|
| 1253 |
+
type: string
|
| 1254 |
+
required: true
|
| 1255 |
+
description: Query string.
|
| 1256 |
+
document_ids:
|
| 1257 |
+
type: array
|
| 1258 |
+
items:
|
| 1259 |
+
type: string
|
| 1260 |
+
description: List of document IDs to filter.
|
| 1261 |
+
similarity_threshold:
|
| 1262 |
+
type: number
|
| 1263 |
+
format: float
|
| 1264 |
+
description: Similarity threshold.
|
| 1265 |
+
vector_similarity_weight:
|
| 1266 |
+
type: number
|
| 1267 |
+
format: float
|
| 1268 |
+
description: Vector similarity weight.
|
| 1269 |
+
top_k:
|
| 1270 |
+
type: integer
|
| 1271 |
+
description: Maximum number of chunks to return.
|
| 1272 |
+
highlight:
|
| 1273 |
+
type: boolean
|
| 1274 |
+
description: Whether to highlight matched content.
|
| 1275 |
+
- in: header
|
| 1276 |
+
name: Authorization
|
| 1277 |
+
type: string
|
| 1278 |
+
required: true
|
| 1279 |
+
description: Bearer token for authentication.
|
| 1280 |
+
responses:
|
| 1281 |
+
200:
|
| 1282 |
+
description: Retrieval results.
|
| 1283 |
+
schema:
|
| 1284 |
+
type: object
|
| 1285 |
+
properties:
|
| 1286 |
+
chunks:
|
| 1287 |
+
type: array
|
| 1288 |
+
items:
|
| 1289 |
+
type: object
|
| 1290 |
+
properties:
|
| 1291 |
+
id:
|
| 1292 |
+
type: string
|
| 1293 |
+
description: Chunk ID.
|
| 1294 |
+
content:
|
| 1295 |
+
type: string
|
| 1296 |
+
description: Chunk content.
|
| 1297 |
+
document_id:
|
| 1298 |
+
type: string
|
| 1299 |
+
description: ID of the document.
|
| 1300 |
+
dataset_id:
|
| 1301 |
+
type: string
|
| 1302 |
+
description: ID of the dataset.
|
| 1303 |
+
similarity:
|
| 1304 |
+
type: number
|
| 1305 |
+
format: float
|
| 1306 |
+
description: Similarity score.
|
| 1307 |
+
"""
|
| 1308 |
req = request.json
|
| 1309 |
if not req.get("dataset_ids"):
|
| 1310 |
return get_error_data_result("`dataset_ids` is required.")
|
| 1311 |
kb_ids = req["dataset_ids"]
|
| 1312 |
+
if not isinstance(kb_ids, list):
|
| 1313 |
return get_error_data_result("`dataset_ids` should be a list")
|
| 1314 |
kbs = KnowledgebaseService.get_by_ids(kb_ids)
|
| 1315 |
for id in kb_ids:
|
| 1316 |
+
if not KnowledgebaseService.query(id=id, tenant_id=tenant_id):
|
| 1317 |
return get_error_data_result(f"You don't own the dataset {id}.")
|
| 1318 |
embd_nms = list(set([kb.embd_id for kb in kbs]))
|
| 1319 |
if len(embd_nms) != 1:
|
| 1320 |
return get_result(
|
| 1321 |
retmsg='Datasets use different embedding models.',
|
| 1322 |
+
retcode=RetCode.AUTHENTICATION_ERROR,
|
| 1323 |
+
)
|
| 1324 |
if "question" not in req:
|
| 1325 |
return get_error_data_result("`question` is required.")
|
| 1326 |
page = int(req.get("offset", 1))
|
| 1327 |
size = int(req.get("limit", 1024))
|
| 1328 |
question = req["question"]
|
| 1329 |
doc_ids = req.get("document_ids", [])
|
| 1330 |
+
if not isinstance(doc_ids, list):
|
| 1331 |
return get_error_data_result("`documents` should be a list")
|
| 1332 |
+
doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
|
| 1333 |
for doc_id in doc_ids:
|
| 1334 |
if doc_id not in doc_ids_list:
|
| 1335 |
+
return get_error_data_result(
|
| 1336 |
+
f"The datasets don't own the document {doc_id}"
|
| 1337 |
+
)
|
| 1338 |
similarity_threshold = float(req.get("similarity_threshold", 0.2))
|
| 1339 |
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
|
| 1340 |
top = int(req.get("top_k", 1024))
|
| 1341 |
+
if req.get("highlight") == "False" or req.get("highlight") == "false":
|
| 1342 |
highlight = False
|
| 1343 |
else:
|
| 1344 |
highlight = True
|
|
|
|
| 1347 |
if not e:
|
| 1348 |
return get_error_data_result(retmsg="Dataset not found!")
|
| 1349 |
embd_mdl = TenantLLMService.model_instance(
|
| 1350 |
+
kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
|
| 1351 |
+
)
|
| 1352 |
|
| 1353 |
rerank_mdl = None
|
| 1354 |
if req.get("rerank_id"):
|
| 1355 |
rerank_mdl = TenantLLMService.model_instance(
|
| 1356 |
+
kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
|
| 1357 |
+
)
|
| 1358 |
|
| 1359 |
if req.get("keyword", False):
|
| 1360 |
chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
|
| 1361 |
question += keyword_extraction(chat_mdl, question)
|
| 1362 |
|
| 1363 |
retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
|
| 1364 |
+
ranks = retr.retrieval(
|
| 1365 |
+
question,
|
| 1366 |
+
embd_mdl,
|
| 1367 |
+
kb.tenant_id,
|
| 1368 |
+
kb_ids,
|
| 1369 |
+
page,
|
| 1370 |
+
size,
|
| 1371 |
+
similarity_threshold,
|
| 1372 |
+
vector_similarity_weight,
|
| 1373 |
+
top,
|
| 1374 |
+
doc_ids,
|
| 1375 |
+
rerank_mdl=rerank_mdl,
|
| 1376 |
+
highlight=highlight,
|
| 1377 |
+
)
|
| 1378 |
for c in ranks["chunks"]:
|
| 1379 |
if "vector" in c:
|
| 1380 |
del c["vector"]
|
|
|
|
| 1387 |
"content_with_weight": "content",
|
| 1388 |
"doc_id": "document_id",
|
| 1389 |
"important_kwd": "important_keywords",
|
| 1390 |
+
"docnm_kwd": "document_keyword",
|
| 1391 |
}
|
| 1392 |
rename_chunk = {}
|
| 1393 |
for key, value in chunk.items():
|
|
|
|
| 1398 |
return get_result(data=ranks)
|
| 1399 |
except Exception as e:
|
| 1400 |
if str(e).find("not_found") > 0:
|
| 1401 |
+
return get_result(
|
| 1402 |
+
retmsg=f"No chunk found! Check the chunk status please!",
|
| 1403 |
+
retcode=RetCode.DATA_ERROR,
|
| 1404 |
+
)
|
| 1405 |
+
return server_error_response(e)
|
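To close out the doc.py routes, a sketch of the retrieval endpoint documented above; omitted numeric parameters fall back to the defaults shown in the handler (placeholder values):

    import requests

    BASE = "http://localhost:9380/api/v1"
    HEADERS = {"Authorization": "Bearer <API_KEY>"}

    resp = requests.post(
        f"{BASE}/retrieval",
        headers=HEADERS,
        json={
            "dataset_ids": ["<dataset_id>"],
            "question": "What does the installation guide say about GPU support?",
            "similarity_threshold": 0.2,
            "vector_similarity_weight": 0.3,
            "top_k": 1024,
            "highlight": True,
        },
    )
    print(resp.json())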
api/apps/system_app.py
CHANGED
|
@@ -24,8 +24,14 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
|
| 24 |
from api.db.services.user_service import UserTenantService
|
| 25 |
from api.settings import DATABASE_TYPE
|
| 26 |
from api.utils import current_timestamp, datetime_format
|
| 27 |
-
from api.utils.api_utils import
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
from api.versions import get_rag_version
|
| 30 |
from rag.utils.es_conn import ELASTICSEARCH
|
| 31 |
from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
|
|
@@ -34,44 +40,121 @@ from timeit import default_timer as timer
|
|
| 34 |
from rag.utils.redis_conn import REDIS_CONN
|
| 35 |
|
| 36 |
|
| 37 |
-
@manager.route(
|
| 38 |
@login_required
|
| 39 |
def version():
|
| 40 |
return get_json_result(data=get_rag_version())
|
| 41 |
|
| 42 |
|
| 43 |
-
@manager.route(
|
| 44 |
@login_required
|
| 45 |
def status():
|
| 46 |
res = {}
|
| 47 |
st = timer()
|
| 48 |
try:
|
| 49 |
res["es"] = ELASTICSEARCH.health()
|
| 50 |
-
res["es"]["elapsed"] = "{:.1f}".format((timer() - st)*1000.)
|
| 51 |
except Exception as e:
|
| 52 |
-
res["es"] = {
|
| 53 |
|
| 54 |
st = timer()
|
| 55 |
try:
|
| 56 |
STORAGE_IMPL.health()
|
| 57 |
-
res["storage"] = {
|
| 58 |
except Exception as e:
|
| 59 |
-
res["storage"] = {
|
| 60 |
|
| 61 |
st = timer()
|
| 62 |
try:
|
| 63 |
KnowledgebaseService.get_by_id("x")
|
| 64 |
-
res["database"] = {
|
| 65 |
except Exception as e:
|
| 66 |
-
res["database"] = {
|
| 67 |
|
| 68 |
st = timer()
|
| 69 |
try:
|
| 70 |
if not REDIS_CONN.health():
|
| 71 |
raise Exception("Lost connection!")
|
| 72 |
-
res["redis"] = {
|
| 73 |
except Exception as e:
|
| 74 |
-
res["redis"] = {
|
| 75 |
|
| 76 |
try:
|
| 77 |
v = REDIS_CONN.get("TASKEXE")
|
|
@@ -84,10 +167,12 @@ def status():
|
|
| 84 |
if len(arr) == 1:
|
| 85 |
obj[id] = [0]
|
| 86 |
else:
|
| 87 |
-
obj[id] = [arr[i+1]-arr[i] for i in range(len(arr)-1)]
|
| 88 |
elapsed = max(obj[id])
|
| 89 |
-
if elapsed > 50:
|
| 90 |
-
|
| 91 |
res["task_executor"] = {"status": color, "elapsed": obj}
|
| 92 |
except Exception as e:
|
| 93 |
res["task_executor"] = {"status": "red", "error": str(e)}
|
|
@@ -95,21 +180,46 @@ def status():
|
|
| 95 |
return get_json_result(data=res)
|
| 96 |
|
| 97 |
|
| 98 |
-
@manager.route(
|
| 99 |
@login_required
|
| 100 |
def new_token():
|
| 101 |
try:
|
| 102 |
tenants = UserTenantService.query(user_id=current_user.id)
|
| 103 |
if not tenants:
|
| 104 |
return get_data_error_result(retmsg="Tenant not found!")
|
| 105 |
|
| 106 |
tenant_id = tenants[0].tenant_id
|
| 107 |
-
obj = {
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
| 113 |
|
| 114 |
if not APITokenService.save(**obj):
|
| 115 |
return get_data_error_result(retmsg="Fail to new a dialog!")
|
|
@@ -119,9 +229,37 @@ def new_token():
|
|
| 119 |
return server_error_response(e)
|
| 120 |
|
| 121 |
|
| 122 |
-
@manager.route(
|
| 123 |
@login_required
|
| 124 |
def token_list():
|
| 125 |
try:
|
| 126 |
tenants = UserTenantService.query(user_id=current_user.id)
|
| 127 |
if not tenants:
|
|
@@ -133,9 +271,33 @@ def token_list():
|
|
| 133 |
return server_error_response(e)
|
| 134 |
|
| 135 |
|
| 136 |
-
@manager.route(
|
| 137 |
@login_required
|
| 138 |
def rm(token):
|
| 139 |
APITokenService.filter_delete(
|
| 140 |
-
|
| 141 |
-
|
| 24 |
from api.db.services.user_service import UserTenantService
|
| 25 |
from api.settings import DATABASE_TYPE
|
| 26 |
from api.utils import current_timestamp, datetime_format
|
| 27 |
+
from api.utils.api_utils import (
|
| 28 |
+
get_json_result,
|
| 29 |
+
get_data_error_result,
|
| 30 |
+
server_error_response,
|
| 31 |
+
generate_confirmation_token,
|
| 32 |
+
request,
|
| 33 |
+
validate_request,
|
| 34 |
+
)
|
| 35 |
from api.versions import get_rag_version
|
| 36 |
from rag.utils.es_conn import ELASTICSEARCH
|
| 37 |
from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
|
|
|
|
| 40 |
from rag.utils.redis_conn import REDIS_CONN
|
| 41 |
|
| 42 |
|
| 43 |
+
@manager.route("/version", methods=["GET"])
|
| 44 |
@login_required
|
| 45 |
def version():
|
| 46 |
+
"""
|
| 47 |
+
Get the current version of the application.
|
| 48 |
+
---
|
| 49 |
+
tags:
|
| 50 |
+
- System
|
| 51 |
+
security:
|
| 52 |
+
- ApiKeyAuth: []
|
| 53 |
+
responses:
|
| 54 |
+
200:
|
| 55 |
+
description: Version retrieved successfully.
|
| 56 |
+
schema:
|
| 57 |
+
type: object
|
| 58 |
+
properties:
|
| 59 |
+
version:
|
| 60 |
+
type: string
|
| 61 |
+
description: Version number.
|
| 62 |
+
"""
|
| 63 |
return get_json_result(data=get_rag_version())
|
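The `/version` handler above carries its OpenAPI description directly in the docstring, which is what flasgger renders in the Swagger UI. A minimal client-side sketch, assuming the system blueprint is mounted under `/v1/system` and that the API token is sent as a bearer header (neither is fixed by this hunk):

```python
# Hedged sketch: calling the /version route documented above.
# The "/v1/system" prefix, port and Authorization header format are assumptions.
import requests

BASE_URL = "http://localhost:9380"   # hypothetical host/port
API_KEY = "ragflow-xxxxxxxx"         # hypothetical API token

resp = requests.get(
    f"{BASE_URL}/v1/system/version",
    headers={"Authorization": f"Bearer {API_KEY}"},
)
resp.raise_for_status()
print(resp.json())   # e.g. {"retcode": 0, "retmsg": "success", "data": "<version>"}
```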
| 64 |
|
| 65 |
|
| 66 |
+
@manager.route("/status", methods=["GET"])
|
| 67 |
@login_required
|
| 68 |
def status():
|
| 69 |
+
"""
|
| 70 |
+
Get the system status.
|
| 71 |
+
---
|
| 72 |
+
tags:
|
| 73 |
+
- System
|
| 74 |
+
security:
|
| 75 |
+
- ApiKeyAuth: []
|
| 76 |
+
responses:
|
| 77 |
+
200:
|
| 78 |
+
description: System is operational.
|
| 79 |
+
schema:
|
| 80 |
+
type: object
|
| 81 |
+
properties:
|
| 82 |
+
es:
|
| 83 |
+
type: object
|
| 84 |
+
description: Elasticsearch status.
|
| 85 |
+
storage:
|
| 86 |
+
type: object
|
| 87 |
+
description: Storage status.
|
| 88 |
+
database:
|
| 89 |
+
type: object
|
| 90 |
+
description: Database status.
|
| 91 |
+
503:
|
| 92 |
+
description: Service unavailable.
|
| 93 |
+
schema:
|
| 94 |
+
type: object
|
| 95 |
+
properties:
|
| 96 |
+
error:
|
| 97 |
+
type: string
|
| 98 |
+
description: Error message.
|
| 99 |
+
"""
|
| 100 |
res = {}
|
| 101 |
st = timer()
|
| 102 |
try:
|
| 103 |
res["es"] = ELASTICSEARCH.health()
|
| 104 |
+
res["es"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
|
| 105 |
except Exception as e:
|
| 106 |
+
res["es"] = {
|
| 107 |
+
"status": "red",
|
| 108 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 109 |
+
"error": str(e),
|
| 110 |
+
}
|
| 111 |
|
| 112 |
st = timer()
|
| 113 |
try:
|
| 114 |
STORAGE_IMPL.health()
|
| 115 |
+
res["storage"] = {
|
| 116 |
+
"storage": STORAGE_IMPL_TYPE.lower(),
|
| 117 |
+
"status": "green",
|
| 118 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 119 |
+
}
|
| 120 |
except Exception as e:
|
| 121 |
+
res["storage"] = {
|
| 122 |
+
"storage": STORAGE_IMPL_TYPE.lower(),
|
| 123 |
+
"status": "red",
|
| 124 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 125 |
+
"error": str(e),
|
| 126 |
+
}
|
| 127 |
|
| 128 |
st = timer()
|
| 129 |
try:
|
| 130 |
KnowledgebaseService.get_by_id("x")
|
| 131 |
+
res["database"] = {
|
| 132 |
+
"database": DATABASE_TYPE.lower(),
|
| 133 |
+
"status": "green",
|
| 134 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 135 |
+
}
|
| 136 |
except Exception as e:
|
| 137 |
+
res["database"] = {
|
| 138 |
+
"database": DATABASE_TYPE.lower(),
|
| 139 |
+
"status": "red",
|
| 140 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 141 |
+
"error": str(e),
|
| 142 |
+
}
|
| 143 |
|
| 144 |
st = timer()
|
| 145 |
try:
|
| 146 |
if not REDIS_CONN.health():
|
| 147 |
raise Exception("Lost connection!")
|
| 148 |
+
res["redis"] = {
|
| 149 |
+
"status": "green",
|
| 150 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 151 |
+
}
|
| 152 |
except Exception as e:
|
| 153 |
+
res["redis"] = {
|
| 154 |
+
"status": "red",
|
| 155 |
+
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
| 156 |
+
"error": str(e),
|
| 157 |
+
}
|
| 158 |
|
| 159 |
try:
|
| 160 |
v = REDIS_CONN.get("TASKEXE")
|
|
|
|
| 167 |
if len(arr) == 1:
|
| 168 |
obj[id] = [0]
|
| 169 |
else:
|
| 170 |
+
obj[id] = [arr[i + 1] - arr[i] for i in range(len(arr) - 1)]
|
| 171 |
elapsed = max(obj[id])
|
| 172 |
+
if elapsed > 50:
|
| 173 |
+
color = "yellow"
|
| 174 |
+
if elapsed > 120:
|
| 175 |
+
color = "red"
|
| 176 |
res["task_executor"] = {"status": color, "elapsed": obj}
|
| 177 |
except Exception as e:
|
| 178 |
res["task_executor"] = {"status": "red", "error": str(e)}
|
|
|
|
| 180 |
return get_json_result(data=res)
|
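As the handler shows, the `/status` response groups health per component (`es`, `storage`, `database`, `redis`, `task_executor`), each with a `status` colour and an `elapsed` time in milliseconds; `task_executor` instead reports, per executor id, the gaps between consecutive timestamps pulled from Redis. A consumption sketch, under the same URL and auth assumptions as the `/version` example:

```python
# Hedged sketch: polling the /status route above and flagging unhealthy parts.
import requests

resp = requests.get(
    "http://localhost:9380/v1/system/status",                 # assumed prefix
    headers={"Authorization": "Bearer ragflow-xxxxxxxx"},     # hypothetical token
)
for component, info in resp.json().get("data", {}).items():
    if info.get("status") != "green":
        print(component, info.get("status"), info.get("error", ""))
```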
| 181 |
|
| 182 |
|
| 183 |
+
@manager.route("/new_token", methods=["POST"])
|
| 184 |
@login_required
|
| 185 |
def new_token():
|
| 186 |
+
"""
|
| 187 |
+
Generate a new API token.
|
| 188 |
+
---
|
| 189 |
+
tags:
|
| 190 |
+
- API Tokens
|
| 191 |
+
security:
|
| 192 |
+
- ApiKeyAuth: []
|
| 193 |
+
parameters:
|
| 194 |
+
- in: query
|
| 195 |
+
name: name
|
| 196 |
+
type: string
|
| 197 |
+
required: false
|
| 198 |
+
description: Name of the token.
|
| 199 |
+
responses:
|
| 200 |
+
200:
|
| 201 |
+
description: Token generated successfully.
|
| 202 |
+
schema:
|
| 203 |
+
type: object
|
| 204 |
+
properties:
|
| 205 |
+
token:
|
| 206 |
+
type: string
|
| 207 |
+
description: The generated API token.
|
| 208 |
+
"""
|
| 209 |
try:
|
| 210 |
tenants = UserTenantService.query(user_id=current_user.id)
|
| 211 |
if not tenants:
|
| 212 |
return get_data_error_result(retmsg="Tenant not found!")
|
| 213 |
|
| 214 |
tenant_id = tenants[0].tenant_id
|
| 215 |
+
obj = {
|
| 216 |
+
"tenant_id": tenant_id,
|
| 217 |
+
"token": generate_confirmation_token(tenant_id),
|
| 218 |
+
"create_time": current_timestamp(),
|
| 219 |
+
"create_date": datetime_format(datetime.now()),
|
| 220 |
+
"update_time": None,
|
| 221 |
+
"update_date": None,
|
| 222 |
+
}
|
| 223 |
|
| 224 |
if not APITokenService.save(**obj):
|
| 225 |
return get_data_error_result(retmsg="Fail to new a dialog!")
|
|
|
|
| 229 |
return server_error_response(e)
|
| 230 |
|
| 231 |
|
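`new_token` assembles the record itself: the first tenant of the logged-in user, a token produced by `generate_confirmation_token(tenant_id)`, and the creation timestamps, then persists it through `APITokenService.save`. A hedged sketch of exercising it with an authenticated session (the login flow and URL prefix are assumptions):

```python
# Hedged sketch: creating an API token through the POST /new_token route above.
# The route is @login_required, so a flask-login session cookie is assumed.
import requests

session = requests.Session()
# ...assume the session has already authenticated via the user login endpoint...
resp = session.post("http://localhost:9380/v1/system/new_token")
print(resp.json())   # contains the generated "token" and its create_date
```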
| 232 |
+
@manager.route("/token_list", methods=["GET"])
|
| 233 |
@login_required
|
| 234 |
def token_list():
|
| 235 |
+
"""
|
| 236 |
+
List all API tokens for the current user.
|
| 237 |
+
---
|
| 238 |
+
tags:
|
| 239 |
+
- API Tokens
|
| 240 |
+
security:
|
| 241 |
+
- ApiKeyAuth: []
|
| 242 |
+
responses:
|
| 243 |
+
200:
|
| 244 |
+
description: List of API tokens.
|
| 245 |
+
schema:
|
| 246 |
+
type: object
|
| 247 |
+
properties:
|
| 248 |
+
tokens:
|
| 249 |
+
type: array
|
| 250 |
+
items:
|
| 251 |
+
type: object
|
| 252 |
+
properties:
|
| 253 |
+
token:
|
| 254 |
+
type: string
|
| 255 |
+
description: The API token.
|
| 256 |
+
name:
|
| 257 |
+
type: string
|
| 258 |
+
description: Name of the token.
|
| 259 |
+
create_time:
|
| 260 |
+
type: string
|
| 261 |
+
description: Token creation time.
|
| 262 |
+
"""
|
| 263 |
try:
|
| 264 |
tenants = UserTenantService.query(user_id=current_user.id)
|
| 265 |
if not tenants:
|
|
|
|
| 271 |
return server_error_response(e)
|
| 272 |
|
| 273 |
|
| 274 |
+
@manager.route("/token/<token>", methods=["DELETE"])
|
| 275 |
@login_required
|
| 276 |
def rm(token):
|
| 277 |
+
"""
|
| 278 |
+
Remove an API token.
|
| 279 |
+
---
|
| 280 |
+
tags:
|
| 281 |
+
- API Tokens
|
| 282 |
+
security:
|
| 283 |
+
- ApiKeyAuth: []
|
| 284 |
+
parameters:
|
| 285 |
+
- in: path
|
| 286 |
+
name: token
|
| 287 |
+
type: string
|
| 288 |
+
required: true
|
| 289 |
+
description: The API token to remove.
|
| 290 |
+
responses:
|
| 291 |
+
200:
|
| 292 |
+
description: Token removed successfully.
|
| 293 |
+
schema:
|
| 294 |
+
type: object
|
| 295 |
+
properties:
|
| 296 |
+
success:
|
| 297 |
+
type: boolean
|
| 298 |
+
description: Deletion status.
|
| 299 |
+
"""
|
| 300 |
APITokenService.filter_delete(
|
| 301 |
+
[APIToken.tenant_id == current_user.id, APIToken.token == token]
|
| 302 |
+
)
|
| 303 |
+
return get_json_result(data=True)
|
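Together with `token_list` and the `DELETE /token/<token>` route above, this gives the full token lifecycle for a tenant. A short sketch, again assuming an authenticated session and the `/v1/system` prefix:

```python
# Hedged sketch: list the current tenant's API tokens, then delete the first one.
import requests

session = requests.Session()            # assumed to hold a valid login session
base = "http://localhost:9380/v1/system"

tokens = session.get(f"{base}/token_list").json().get("data", [])
for t in tokens:
    print(t.get("token"), t.get("create_date"))

if tokens:
    session.delete(f"{base}/token/{tokens[0]['token']}")   # removes that token
```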
api/apps/user_app.py
CHANGED
|
@@ -23,65 +23,141 @@ from flask_login import login_required, current_user, login_user, logout_user
|
@@ -103,19 +179,22 @@ def github_callback():
|
@@ -134,30 +213,56 @@ def feishu_callback():
|
@@ -176,19 +281,22 @@ def feishu_callback():
|
@@ -209,11 +317,14 @@ def user_info_from_feishu(access_token):
|
@@ -221,24 +332,38 @@ def user_info_from_feishu(access_token):
|
@@ -248,20 +373,62 @@ def log_out():
|
@@ -270,12 +437,37 @@ def setting_user():
|
@@ -310,13 +502,13 @@ def user_register(user_id, user):
|
@@ -331,13 +523,16 @@ def user_register(user_id, user):
|
@@ -351,21 +546,52 @@ def user_register(user_id, user):
|
@@ -383,25 +609,55 @@ def user_add():
|
@@ -415,6 +671,42 @@ def tenant_info():
|
| 23 |
|
| 24 |
from api.db.db_models import TenantLLM
|
| 25 |
from api.db.services.llm_service import TenantLLMService, LLMService
|
| 26 |
+
from api.utils.api_utils import (
|
| 27 |
+
server_error_response,
|
| 28 |
+
validate_request,
|
| 29 |
+
get_data_error_result,
|
| 30 |
+
)
|
| 31 |
+
from api.utils import (
|
| 32 |
+
get_uuid,
|
| 33 |
+
get_format_time,
|
| 34 |
+
decrypt,
|
| 35 |
+
download_img,
|
| 36 |
+
current_timestamp,
|
| 37 |
+
datetime_format,
|
| 38 |
+
)
|
| 39 |
from api.db import UserTenantRole, LLMType, FileType
|
| 40 |
+
from api.settings import (
|
| 41 |
+
RetCode,
|
| 42 |
+
GITHUB_OAUTH,
|
| 43 |
+
FEISHU_OAUTH,
|
| 44 |
+
CHAT_MDL,
|
| 45 |
+
EMBEDDING_MDL,
|
| 46 |
+
ASR_MDL,
|
| 47 |
+
IMAGE2TEXT_MDL,
|
| 48 |
+
PARSERS,
|
| 49 |
+
API_KEY,
|
| 50 |
+
LLM_FACTORY,
|
| 51 |
+
LLM_BASE_URL,
|
| 52 |
+
RERANK_MDL,
|
| 53 |
+
)
|
| 54 |
from api.db.services.user_service import UserService, TenantService, UserTenantService
|
| 55 |
from api.db.services.file_service import FileService
|
| 56 |
from api.settings import stat_logger
|
| 57 |
from api.utils.api_utils import get_json_result, construct_response
|
| 58 |
|
| 59 |
|
| 60 |
+
@manager.route("/login", methods=["POST", "GET"])
|
| 61 |
def login():
|
| 62 |
+
"""
|
| 63 |
+
User login endpoint.
|
| 64 |
+
---
|
| 65 |
+
tags:
|
| 66 |
+
- User
|
| 67 |
+
parameters:
|
| 68 |
+
- in: body
|
| 69 |
+
name: body
|
| 70 |
+
description: Login credentials.
|
| 71 |
+
required: true
|
| 72 |
+
schema:
|
| 73 |
+
type: object
|
| 74 |
+
properties:
|
| 75 |
+
email:
|
| 76 |
+
type: string
|
| 77 |
+
description: User email.
|
| 78 |
+
password:
|
| 79 |
+
type: string
|
| 80 |
+
description: User password.
|
| 81 |
+
responses:
|
| 82 |
+
200:
|
| 83 |
+
description: Login successful.
|
| 84 |
+
schema:
|
| 85 |
+
type: object
|
| 86 |
+
401:
|
| 87 |
+
description: Authentication failed.
|
| 88 |
+
schema:
|
| 89 |
+
type: object
|
| 90 |
+
"""
|
| 91 |
if not request.json:
|
| 92 |
+
return get_json_result(
|
| 93 |
+
data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg="Unauthorized!"
|
| 94 |
+
)
|
| 95 |
|
| 96 |
+
email = request.json.get("email", "")
|
| 97 |
users = UserService.query(email=email)
|
| 98 |
if not users:
|
| 99 |
+
return get_json_result(
|
| 100 |
+
data=False,
|
| 101 |
+
retcode=RetCode.AUTHENTICATION_ERROR,
|
| 102 |
+
retmsg=f"Email: {email} is not registered!",
|
| 103 |
+
)
|
| 104 |
|
| 105 |
+
password = request.json.get("password")
|
| 106 |
try:
|
| 107 |
password = decrypt(password)
|
| 108 |
except BaseException:
|
| 109 |
+
return get_json_result(
|
| 110 |
+
data=False, retcode=RetCode.SERVER_ERROR, retmsg="Fail to crypt password"
|
| 111 |
+
)
|
| 112 |
|
| 113 |
user = UserService.query_user(email, password)
|
| 114 |
if user:
|
| 115 |
response_data = user.to_json()
|
| 116 |
user.access_token = get_uuid()
|
| 117 |
login_user(user)
|
| 118 |
+
user.update_time = (current_timestamp(),)
|
| 119 |
+
user.update_date = (datetime_format(datetime.now()),)
|
| 120 |
user.save()
|
| 121 |
msg = "Welcome back!"
|
| 122 |
return construct_response(data=response_data, auth=user.get_id(), retmsg=msg)
|
| 123 |
else:
|
| 124 |
+
return get_json_result(
|
| 125 |
+
data=False,
|
| 126 |
+
retcode=RetCode.AUTHENTICATION_ERROR,
|
| 127 |
+
retmsg="Email and password do not match!",
|
| 128 |
+
)
|
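Note that the login handler runs the submitted password through `decrypt()` before comparing, so the client must send it already encrypted with whatever scheme `api.utils.decrypt` reverses; that scheme is not part of this diff. A hedged sketch of the request shape only:

```python
# Hedged sketch of a login request against the route above; the password value
# is a placeholder for whatever encrypted form api.utils.decrypt expects.
import requests

payload = {
    "email": "user@example.com",
    "password": "<password, encrypted the way decrypt() expects>",
}
resp = requests.post("http://localhost:9380/v1/user/login", json=payload)   # prefix assumed
print(resp.json())   # "Welcome back!" plus the user record on success
```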
| 129 |
|
| 130 |
|
| 131 |
+
@manager.route("/github_callback", methods=["GET"])
|
| 132 |
def github_callback():
|
| 133 |
+
"""
|
| 134 |
+
GitHub OAuth callback endpoint.
|
| 135 |
+
---
|
| 136 |
+
tags:
|
| 137 |
+
- OAuth
|
| 138 |
+
parameters:
|
| 139 |
+
- in: query
|
| 140 |
+
name: code
|
| 141 |
+
type: string
|
| 142 |
+
required: true
|
| 143 |
+
description: Authorization code from GitHub.
|
| 144 |
+
responses:
|
| 145 |
+
200:
|
| 146 |
+
description: Authentication successful.
|
| 147 |
+
schema:
|
| 148 |
+
type: object
|
| 149 |
+
"""
|
| 150 |
import requests
|
| 151 |
+
|
| 152 |
+
res = requests.post(
|
| 153 |
+
GITHUB_OAUTH.get("url"),
|
| 154 |
+
data={
|
| 155 |
+
"client_id": GITHUB_OAUTH.get("client_id"),
|
| 156 |
+
"client_secret": GITHUB_OAUTH.get("secret_key"),
|
| 157 |
+
"code": request.args.get("code"),
|
| 158 |
+
},
|
| 159 |
+
headers={"Accept": "application/json"},
|
| 160 |
+
)
|
| 161 |
res = res.json()
|
| 162 |
if "error" in res:
|
| 163 |
return redirect("/?error=%s" % res["error_description"])
|
|
|
|
| 179 |
except Exception as e:
|
| 180 |
stat_logger.exception(e)
|
| 181 |
avatar = ""
|
| 182 |
+
users = user_register(
|
| 183 |
+
user_id,
|
| 184 |
+
{
|
| 185 |
+
"access_token": session["access_token"],
|
| 186 |
+
"email": email_address,
|
| 187 |
+
"avatar": avatar,
|
| 188 |
+
"nickname": user_info["login"],
|
| 189 |
+
"login_channel": "github",
|
| 190 |
+
"last_login_time": get_format_time(),
|
| 191 |
+
"is_superuser": False,
|
| 192 |
+
},
|
| 193 |
+
)
|
| 194 |
if not users:
|
| 195 |
+
raise Exception(f"Fail to register {email_address}.")
|
| 196 |
if len(users) > 1:
|
| 197 |
+
raise Exception(f"Same email: {email_address} exists!")
|
| 198 |
|
| 199 |
# Try to log in
|
| 200 |
user = users[0]
|
|
|
|
| 213 |
return redirect("/?auth=%s" % user.get_id())
|
| 214 |
|
| 215 |
|
| 216 |
+
@manager.route("/feishu_callback", methods=["GET"])
|
| 217 |
def feishu_callback():
|
| 218 |
+
"""
|
| 219 |
+
Feishu OAuth callback endpoint.
|
| 220 |
+
---
|
| 221 |
+
tags:
|
| 222 |
+
- OAuth
|
| 223 |
+
parameters:
|
| 224 |
+
- in: query
|
| 225 |
+
name: code
|
| 226 |
+
type: string
|
| 227 |
+
required: true
|
| 228 |
+
description: Authorization code from Feishu.
|
| 229 |
+
responses:
|
| 230 |
+
200:
|
| 231 |
+
description: Authentication successful.
|
| 232 |
+
schema:
|
| 233 |
+
type: object
|
| 234 |
+
"""
|
| 235 |
import requests
|
| 236 |
+
|
| 237 |
+
app_access_token_res = requests.post(
|
| 238 |
+
FEISHU_OAUTH.get("app_access_token_url"),
|
| 239 |
+
data=json.dumps(
|
| 240 |
+
{
|
| 241 |
+
"app_id": FEISHU_OAUTH.get("app_id"),
|
| 242 |
+
"app_secret": FEISHU_OAUTH.get("app_secret"),
|
| 243 |
+
}
|
| 244 |
+
),
|
| 245 |
+
headers={"Content-Type": "application/json; charset=utf-8"},
|
| 246 |
+
)
|
| 247 |
app_access_token_res = app_access_token_res.json()
|
| 248 |
+
if app_access_token_res["code"] != 0:
|
| 249 |
return redirect("/?error=%s" % app_access_token_res)
|
| 250 |
|
| 251 |
+
res = requests.post(
|
| 252 |
+
FEISHU_OAUTH.get("user_access_token_url"),
|
| 253 |
+
data=json.dumps(
|
| 254 |
+
{
|
| 255 |
+
"grant_type": FEISHU_OAUTH.get("grant_type"),
|
| 256 |
+
"code": request.args.get("code"),
|
| 257 |
+
}
|
| 258 |
+
),
|
| 259 |
+
headers={
|
| 260 |
+
"Content-Type": "application/json; charset=utf-8",
|
| 261 |
+
"Authorization": f"Bearer {app_access_token_res['app_access_token']}",
|
| 262 |
+
},
|
| 263 |
+
)
|
| 264 |
res = res.json()
|
| 265 |
+
if res["code"] != 0:
|
| 266 |
return redirect("/?error=%s" % res["message"])
|
| 267 |
|
| 268 |
if "contact:user.email:readonly" not in res["data"]["scope"].split(" "):
|
|
|
|
| 281 |
except Exception as e:
|
| 282 |
stat_logger.exception(e)
|
| 283 |
avatar = ""
|
| 284 |
+
users = user_register(
|
| 285 |
+
user_id,
|
| 286 |
+
{
|
| 287 |
+
"access_token": session["access_token"],
|
| 288 |
+
"email": email_address,
|
| 289 |
+
"avatar": avatar,
|
| 290 |
+
"nickname": user_info["en_name"],
|
| 291 |
+
"login_channel": "feishu",
|
| 292 |
+
"last_login_time": get_format_time(),
|
| 293 |
+
"is_superuser": False,
|
| 294 |
+
},
|
| 295 |
+
)
|
| 296 |
if not users:
|
| 297 |
+
raise Exception(f"Fail to register {email_address}.")
|
| 298 |
if len(users) > 1:
|
| 299 |
+
raise Exception(f"Same email: {email_address} exists!")
|
| 300 |
|
| 301 |
# Try to log in
|
| 302 |
user = users[0]
|
|
|
|
| 317 |
|
| 318 |
def user_info_from_feishu(access_token):
|
| 319 |
import requests
|
| 320 |
+
|
| 321 |
+
headers = {
|
| 322 |
+
"Content-Type": "application/json; charset=utf-8",
|
| 323 |
+
"Authorization": f"Bearer {access_token}",
|
| 324 |
+
}
|
| 325 |
res = requests.get(
|
| 326 |
+
f"https://open.feishu.cn/open-apis/authen/v1/user_info", headers=headers
|
| 327 |
+
)
|
| 328 |
user_info = res.json()["data"]
|
| 329 |
user_info["email"] = None if user_info.get("email") == "" else user_info["email"]
|
| 330 |
return user_info
|
|
|
|
| 332 |
|
| 333 |
def user_info_from_github(access_token):
|
| 334 |
import requests
|
| 335 |
+
|
| 336 |
+
headers = {"Accept": "application/json", "Authorization": f"token {access_token}"}
|
| 337 |
res = requests.get(
|
| 338 |
+
f"https://api.github.com/user?access_token={access_token}", headers=headers
|
| 339 |
+
)
|
| 340 |
user_info = res.json()
|
| 341 |
email_info = requests.get(
|
| 342 |
f"https://api.github.com/user/emails?access_token={access_token}",
|
| 343 |
+
headers=headers,
|
| 344 |
+
).json()
|
| 345 |
user_info["email"] = next(
|
| 346 |
+
(email for email in email_info if email["primary"] == True), None
|
| 347 |
+
)["email"]
|
| 348 |
return user_info
|
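`user_info_from_github` merges the `/user` profile with the primary address from `/user/emails`. The selection expression on the added lines can be illustrated in isolation:

```python
# Stand-alone illustration of the primary-email lookup used above, with sample
# data in the shape returned by GitHub's /user/emails endpoint.
email_info = [
    {"email": "12345+dev@users.noreply.github.com", "primary": False, "verified": True},
    {"email": "dev@example.com", "primary": True, "verified": True},
]
primary = next((email for email in email_info if email["primary"] == True), None)
print(primary["email"])   # -> dev@example.com (raises TypeError if no primary entry)
```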
| 349 |
|
| 350 |
|
| 351 |
+
@manager.route("/logout", methods=["GET"])
|
| 352 |
@login_required
|
| 353 |
def log_out():
|
| 354 |
+
"""
|
| 355 |
+
User logout endpoint.
|
| 356 |
+
---
|
| 357 |
+
tags:
|
| 358 |
+
- User
|
| 359 |
+
security:
|
| 360 |
+
- ApiKeyAuth: []
|
| 361 |
+
responses:
|
| 362 |
+
200:
|
| 363 |
+
description: Logout successful.
|
| 364 |
+
schema:
|
| 365 |
+
type: object
|
| 366 |
+
"""
|
| 367 |
current_user.access_token = ""
|
| 368 |
current_user.save()
|
| 369 |
logout_user()
|
|
|
|
| 373 |
@manager.route("/setting", methods=["POST"])
|
| 374 |
@login_required
|
| 375 |
def setting_user():
|
| 376 |
+
"""
|
| 377 |
+
Update user settings.
|
| 378 |
+
---
|
| 379 |
+
tags:
|
| 380 |
+
- User
|
| 381 |
+
security:
|
| 382 |
+
- ApiKeyAuth: []
|
| 383 |
+
parameters:
|
| 384 |
+
- in: body
|
| 385 |
+
name: body
|
| 386 |
+
description: User settings to update.
|
| 387 |
+
required: true
|
| 388 |
+
schema:
|
| 389 |
+
type: object
|
| 390 |
+
properties:
|
| 391 |
+
nickname:
|
| 392 |
+
type: string
|
| 393 |
+
description: New nickname.
|
| 394 |
+
email:
|
| 395 |
+
type: string
|
| 396 |
+
description: New email.
|
| 397 |
+
responses:
|
| 398 |
+
200:
|
| 399 |
+
description: Settings updated successfully.
|
| 400 |
+
schema:
|
| 401 |
+
type: object
|
| 402 |
+
"""
|
| 403 |
update_dict = {}
|
| 404 |
request_data = request.json
|
| 405 |
if request_data.get("password"):
|
| 406 |
new_password = request_data.get("new_password")
|
| 407 |
if not check_password_hash(
|
| 408 |
+
current_user.password, decrypt(request_data["password"])
|
| 409 |
+
):
|
| 410 |
+
return get_json_result(
|
| 411 |
+
data=False,
|
| 412 |
+
retcode=RetCode.AUTHENTICATION_ERROR,
|
| 413 |
+
retmsg="Password error!",
|
| 414 |
+
)
|
| 415 |
|
| 416 |
if new_password:
|
| 417 |
update_dict["password"] = generate_password_hash(decrypt(new_password))
|
| 418 |
|
| 419 |
for k in request_data.keys():
|
| 420 |
+
if k in [
|
| 421 |
+
"password",
|
| 422 |
+
"new_password",
|
| 423 |
+
"email",
|
| 424 |
+
"status",
|
| 425 |
+
"is_superuser",
|
| 426 |
+
"login_channel",
|
| 427 |
+
"is_anonymous",
|
| 428 |
+
"is_active",
|
| 429 |
+
"is_authenticated",
|
| 430 |
+
"last_login_time",
|
| 431 |
+
]:
|
| 432 |
continue
|
| 433 |
update_dict[k] = request_data[k]
|
| 434 |
|
|
|
|
| 437 |
return get_json_result(data=True)
|
| 438 |
except Exception as e:
|
| 439 |
stat_logger.exception(e)
|
| 440 |
+
return get_json_result(
|
| 441 |
+
data=False, retmsg="Update failure!", retcode=RetCode.EXCEPTION_ERROR
|
| 442 |
+
)
|
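`setting_user` only writes through fields that are not on the skip list above (password handling is separate, and everything identity- or role-related is ignored), so a settings call simply posts the fields to change. A hedged sketch:

```python
# Hedged sketch: updating profile settings via the POST /setting route above.
# Which additional fields the user model accepts is not shown in this diff.
import requests

session = requests.Session()            # assumed authenticated flask-login session
resp = session.post(
    "http://localhost:9380/v1/user/setting",     # URL prefix is an assumption
    json={"nickname": "new-nickname"},
)
print(resp.json())                      # data: true on success
```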
| 443 |
|
| 444 |
|
| 445 |
@manager.route("/info", methods=["GET"])
|
| 446 |
@login_required
|
| 447 |
def user_profile():
|
| 448 |
+
"""
|
| 449 |
+
Get user profile information.
|
| 450 |
+
---
|
| 451 |
+
tags:
|
| 452 |
+
- User
|
| 453 |
+
security:
|
| 454 |
+
- ApiKeyAuth: []
|
| 455 |
+
responses:
|
| 456 |
+
200:
|
| 457 |
+
description: User profile retrieved successfully.
|
| 458 |
+
schema:
|
| 459 |
+
type: object
|
| 460 |
+
properties:
|
| 461 |
+
id:
|
| 462 |
+
type: string
|
| 463 |
+
description: User ID.
|
| 464 |
+
nickname:
|
| 465 |
+
type: string
|
| 466 |
+
description: User nickname.
|
| 467 |
+
email:
|
| 468 |
+
type: string
|
| 469 |
+
description: User email.
|
| 470 |
+
"""
|
| 471 |
return get_json_result(data=current_user.to_dict())
|
| 472 |
|
| 473 |
|
|
|
|
| 502 |
"asr_id": ASR_MDL,
|
| 503 |
"parser_ids": PARSERS,
|
| 504 |
"img2txt_id": IMAGE2TEXT_MDL,
|
| 505 |
+
"rerank_id": RERANK_MDL,
|
| 506 |
}
|
| 507 |
usr_tenant = {
|
| 508 |
"tenant_id": user_id,
|
| 509 |
"user_id": user_id,
|
| 510 |
"invited_by": user_id,
|
| 511 |
+
"role": UserTenantRole.OWNER,
|
| 512 |
}
|
| 513 |
file_id = get_uuid()
|
| 514 |
file = {
|
|
|
|
| 523 |
}
|
| 524 |
tenant_llm = []
|
| 525 |
for llm in LLMService.query(fid=LLM_FACTORY):
|
| 526 |
+
tenant_llm.append(
|
| 527 |
+
{
|
| 528 |
+
"tenant_id": user_id,
|
| 529 |
+
"llm_factory": LLM_FACTORY,
|
| 530 |
+
"llm_name": llm.llm_name,
|
| 531 |
+
"model_type": llm.model_type,
|
| 532 |
+
"api_key": API_KEY,
|
| 533 |
+
"api_base": LLM_BASE_URL,
|
| 534 |
+
}
|
| 535 |
+
)
|
| 536 |
|
| 537 |
if not UserService.save(**user):
|
| 538 |
return
|
|
|
|
| 546 |
@manager.route("/register", methods=["POST"])
|
| 547 |
@validate_request("nickname", "email", "password")
|
| 548 |
def user_add():
|
| 549 |
+
"""
|
| 550 |
+
Register a new user.
|
| 551 |
+
---
|
| 552 |
+
tags:
|
| 553 |
+
- User
|
| 554 |
+
parameters:
|
| 555 |
+
- in: body
|
| 556 |
+
name: body
|
| 557 |
+
description: Registration details.
|
| 558 |
+
required: true
|
| 559 |
+
schema:
|
| 560 |
+
type: object
|
| 561 |
+
properties:
|
| 562 |
+
nickname:
|
| 563 |
+
type: string
|
| 564 |
+
description: User nickname.
|
| 565 |
+
email:
|
| 566 |
+
type: string
|
| 567 |
+
description: User email.
|
| 568 |
+
password:
|
| 569 |
+
type: string
|
| 570 |
+
description: User password.
|
| 571 |
+
responses:
|
| 572 |
+
200:
|
| 573 |
+
description: Registration successful.
|
| 574 |
+
schema:
|
| 575 |
+
type: object
|
| 576 |
+
"""
|
| 577 |
req = request.json
|
| 578 |
email_address = req["email"]
|
| 579 |
|
| 580 |
# Validate the email address
|
| 581 |
if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,5}$", email_address):
|
| 582 |
+
return get_json_result(
|
| 583 |
+
data=False,
|
| 584 |
+
retmsg=f"Invalid email address: {email_address}!",
|
| 585 |
+
retcode=RetCode.OPERATING_ERROR,
|
| 586 |
+
)
|
| 587 |
|
| 588 |
# Check if the email address is already used
|
| 589 |
if UserService.query(email=email_address):
|
| 590 |
return get_json_result(
|
| 591 |
data=False,
|
| 592 |
+
retmsg=f"Email: {email_address} has already registered!",
|
| 593 |
+
retcode=RetCode.OPERATING_ERROR,
|
| 594 |
+
)
|
| 595 |
|
| 596 |
# Construct user info data
|
| 597 |
nickname = req["nickname"]
|
|
|
|
| 609 |
try:
|
| 610 |
users = user_register(user_id, user_dict)
|
| 611 |
if not users:
|
| 612 |
+
raise Exception(f"Fail to register {email_address}.")
|
| 613 |
if len(users) > 1:
|
| 614 |
+
raise Exception(f"Same email: {email_address} exists!")
|
| 615 |
user = users[0]
|
| 616 |
login_user(user)
|
| 617 |
+
return construct_response(
|
| 618 |
+
data=user.to_json(),
|
| 619 |
+
auth=user.get_id(),
|
| 620 |
+
retmsg=f"{nickname}, welcome aboard!",
|
| 621 |
+
)
|
| 622 |
except Exception as e:
|
| 623 |
rollback_user_registration(user_id)
|
| 624 |
stat_logger.exception(e)
|
| 625 |
+
return get_json_result(
|
| 626 |
+
data=False,
|
| 627 |
+
retmsg=f"User registration failure, error: {str(e)}",
|
| 628 |
+
retcode=RetCode.EXCEPTION_ERROR,
|
| 629 |
+
)
|
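Registration first screens the address with the regex above; note the `{2,5}` bound on the final label, so very long top-level domains are rejected before the duplicate check and `user_register` run. A quick check of that pattern:

```python
# The email pattern used by user_add above, applied to a few sample addresses.
import re

EMAIL_RE = r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,5}$"
for addr in ("alice@example.com", "bob@mail.example.co.uk", "carol@example.museum"):
    print(addr, bool(re.match(EMAIL_RE, addr)))
# alice@example.com True / bob@mail.example.co.uk True / carol@example.museum False
```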
| 630 |
|
| 631 |
|
| 632 |
@manager.route("/tenant_info", methods=["GET"])
|
| 633 |
@login_required
|
| 634 |
def tenant_info():
|
| 635 |
+
"""
|
| 636 |
+
Get tenant information.
|
| 637 |
+
---
|
| 638 |
+
tags:
|
| 639 |
+
- Tenant
|
| 640 |
+
security:
|
| 641 |
+
- ApiKeyAuth: []
|
| 642 |
+
responses:
|
| 643 |
+
200:
|
| 644 |
+
description: Tenant information retrieved successfully.
|
| 645 |
+
schema:
|
| 646 |
+
type: object
|
| 647 |
+
properties:
|
| 648 |
+
tenant_id:
|
| 649 |
+
type: string
|
| 650 |
+
description: Tenant ID.
|
| 651 |
+
name:
|
| 652 |
+
type: string
|
| 653 |
+
description: Tenant name.
|
| 654 |
+
llm_id:
|
| 655 |
+
type: string
|
| 656 |
+
description: LLM ID.
|
| 657 |
+
embd_id:
|
| 658 |
+
type: string
|
| 659 |
+
description: Embedding model ID.
|
| 660 |
+
"""
|
| 661 |
try:
|
| 662 |
tenants = TenantService.get_info_by(current_user.id)
|
| 663 |
if not tenants:
|
|
|
|
| 671 |
@login_required
|
| 672 |
@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id")
|
| 673 |
def set_tenant_info():
|
| 674 |
+
"""
|
| 675 |
+
Update tenant information.
|
| 676 |
+
---
|
| 677 |
+
tags:
|
| 678 |
+
- Tenant
|
| 679 |
+
security:
|
| 680 |
+
- ApiKeyAuth: []
|
| 681 |
+
parameters:
|
| 682 |
+
- in: body
|
| 683 |
+
name: body
|
| 684 |
+
description: Tenant information to update.
|
| 685 |
+
required: true
|
| 686 |
+
schema:
|
| 687 |
+
type: object
|
| 688 |
+
properties:
|
| 689 |
+
tenant_id:
|
| 690 |
+
type: string
|
| 691 |
+
description: Tenant ID.
|
| 692 |
+
llm_id:
|
| 693 |
+
type: string
|
| 694 |
+
description: LLM ID.
|
| 695 |
+
embd_id:
|
| 696 |
+
type: string
|
| 697 |
+
description: Embedding model ID.
|
| 698 |
+
asr_id:
|
| 699 |
+
type: string
|
| 700 |
+
description: ASR model ID.
|
| 701 |
+
img2txt_id:
|
| 702 |
+
type: string
|
| 703 |
+
description: Image to Text model ID.
|
| 704 |
+
responses:
|
| 705 |
+
200:
|
| 706 |
+
description: Tenant information updated successfully.
|
| 707 |
+
schema:
|
| 708 |
+
type: object
|
| 709 |
+
"""
|
| 710 |
req = request.json
|
| 711 |
try:
|
| 712 |
tid = req["tenant_id"]
|
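`set_tenant_info` is guarded by `@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id")`, so all five ids must appear in the JSON body. A hedged sketch of the payload (the route path and prefix are assumptions, and the values are placeholders):

```python
# Hedged sketch of the body expected by the tenant-update route above; every key
# named in the validate_request decorator is mandatory.
import requests

tenant_update = {
    "tenant_id": "<tenant id>",
    "llm_id": "<chat model id>",
    "embd_id": "<embedding model id>",
    "asr_id": "<speech-to-text model id>",
    "img2txt_id": "<image-to-text model id>",
}
session = requests.Session()            # assumed authenticated session
session.post("http://localhost:9380/v1/user/set_tenant_info", json=tenant_update)
```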
api/ragflow_server.py
CHANGED
|
@@ -27,7 +27,11 @@ from api.apps import app
|
@@ -45,27 +49,33 @@ def update_progress():
|
@@ -78,7 +88,7 @@ if __name__ == '__main__':
|
@@ -93,7 +103,14 @@ if __name__ == '__main__':
|
| 27 |
from api.db.runtime_config import RuntimeConfig
|
| 28 |
from api.db.services.document_service import DocumentService
|
| 29 |
from api.settings import (
|
| 30 |
+
HOST,
|
| 31 |
+
HTTP_PORT,
|
| 32 |
+
access_logger,
|
| 33 |
+
database_logger,
|
| 34 |
+
stat_logger,
|
| 35 |
)
|
| 36 |
from api import utils
|
| 37 |
|
|
|
|
| 49 |
stat_logger.error("update_progress exception:" + str(e))
|
| 50 |
|
| 51 |
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
print(
|
| 54 |
+
r"""
|
| 55 |
____ ___ ______ ______ __
|
| 56 |
/ __ \ / | / ____// ____// /____ _ __
|
| 57 |
/ /_/ // /| | / / __ / /_ / // __ \| | /| / /
|
| 58 |
/ _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ /
|
| 59 |
/_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/
|
| 60 |
|
| 61 |
+
""",
|
| 62 |
+
flush=True,
|
|
|
|
| 63 |
)
|
| 64 |
+
stat_logger.info(f"project base: {utils.file_utils.get_project_base_directory()}")
|
| 65 |
|
| 66 |
# init db
|
| 67 |
init_web_db()
|
| 68 |
init_web_data()
|
| 69 |
# init runtime config
|
| 70 |
import argparse
|
| 71 |
+
|
| 72 |
parser = argparse.ArgumentParser()
|
| 73 |
+
parser.add_argument(
|
| 74 |
+
"--version", default=False, help="rag flow version", action="store_true"
|
| 75 |
+
)
|
| 76 |
+
parser.add_argument(
|
| 77 |
+
"--debug", default=False, help="debug mode", action="store_true"
|
| 78 |
+
)
|
| 79 |
args = parser.parse_args()
|
| 80 |
if args.version:
|
| 81 |
print(get_versions())
|
|
|
|
| 88 |
RuntimeConfig.init_env()
|
| 89 |
RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT)
|
| 90 |
|
| 91 |
+
peewee_logger = logging.getLogger("peewee")
|
| 92 |
peewee_logger.propagate = False
|
| 93 |
# rag_arch.common.log.ROpenHandler
|
| 94 |
peewee_logger.addHandler(database_logger.handlers[0])
|
|
|
|
| 103 |
werkzeug_logger = logging.getLogger("werkzeug")
|
| 104 |
for h in access_logger.handlers:
|
| 105 |
werkzeug_logger.addHandler(h)
|
| 106 |
+
run_simple(
|
| 107 |
+
hostname=HOST,
|
| 108 |
+
port=HTTP_PORT,
|
| 109 |
+
application=app,
|
| 110 |
+
threaded=True,
|
| 111 |
+
use_reloader=RuntimeConfig.DEBUG,
|
| 112 |
+
use_debugger=RuntimeConfig.DEBUG,
|
| 113 |
+
)
|
| 114 |
except Exception:
|
| 115 |
traceback.print_exc()
|
| 116 |
+
os.kill(os.getpid(), signal.SIGKILL)
|
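The startup block now hands the Flask app to werkzeug's development server, wiring the reloader and debugger to `RuntimeConfig.DEBUG`; a `--debug` flag is also added to the argument parser. For reference, the same `run_simple` call reduced to a self-contained example with a trivial WSGI app (host and port here are placeholders):

```python
# Self-contained werkzeug run_simple example mirroring the call above.
from werkzeug.serving import run_simple
from werkzeug.wrappers import Response


def application(environ, start_response):
    return Response("ok")(environ, start_response)


if __name__ == "__main__":
    run_simple(
        hostname="127.0.0.1",
        port=9380,                 # placeholder port
        application=application,
        threaded=True,
        use_reloader=False,        # True would mirror debug-mode behaviour
        use_debugger=False,
    )
```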
poetry.lock
CHANGED
|
@@ -435,6 +435,17 @@ files = [
|
@@ -1912,7 +1923,10 @@ files = [
|
@@ -2037,6 +2051,24 @@ sentence_transformers = "*"
|
@@ -4381,6 +4413,17 @@ httpx = ">=0.25,<1"
|
@@ -5149,7 +5192,10 @@ files = [
|
@@ -5168,7 +5214,10 @@ files = [
|
@@ -5350,7 +5399,10 @@ files = [
|
@@ -7009,6 +7061,24 @@ lxml = "*"
|
@@ -8468,6 +8538,7 @@ nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"
|
@@ -8611,6 +8682,29 @@ files = [
|
@@ -9446,5 +9540,5 @@ files = [
|
| 435 |
{file = "Aspose.Slides-24.10.0-py3-none-win_amd64.whl", hash = "sha256:8980015fbc32c1e70e80444c70a642597511300ead6b352183bf74ba3da67f2d"},
|
| 436 |
]
|
| 437 |
|
| 438 |
+
[[package]]
|
| 439 |
+
name = "async-timeout"
|
| 440 |
+
version = "4.0.3"
|
| 441 |
+
description = "Timeout context manager for asyncio programs"
|
| 442 |
+
optional = false
|
| 443 |
+
python-versions = ">=3.7"
|
| 444 |
+
files = [
|
| 445 |
+
{file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
|
| 446 |
+
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
|
| 447 |
+
]
|
| 448 |
+
|
| 449 |
[[package]]
|
| 450 |
name = "attrs"
|
| 451 |
version = "24.2.0"
|
|
|
|
| 1923 |
huggingface-hub = ">=0.20,<1.0"
|
| 1924 |
loguru = ">=0.7.2,<0.8.0"
|
| 1925 |
mmh3 = ">=4.0,<5.0"
|
| 1926 |
+
numpy = [
|
| 1927 |
+
{version = ">=1.21,<2", markers = "python_version < \"3.12\""},
|
| 1928 |
+
{version = ">=1.26,<2", markers = "python_version >= \"3.12\""},
|
| 1929 |
+
]
|
| 1930 |
onnx = ">=1.15.0,<2.0.0"
|
| 1931 |
onnxruntime = ">=1.17.0,<2.0.0"
|
| 1932 |
pillow = ">=10.3.0,<11.0.0"
|
|
|
|
| 2051 |
torch = ">=1.6.0"
|
| 2052 |
transformers = ">=4.33.0"
|
| 2053 |
|
| 2054 |
+
[[package]]
|
| 2055 |
+
name = "flasgger"
|
| 2056 |
+
version = "0.9.7.1"
|
| 2057 |
+
description = "Extract swagger specs from your flask project"
|
| 2058 |
+
optional = false
|
| 2059 |
+
python-versions = "*"
|
| 2060 |
+
files = [
|
| 2061 |
+
{file = "flasgger-0.9.7.1.tar.gz", hash = "sha256:ca098e10bfbb12f047acc6299cc70a33851943a746e550d86e65e60d4df245fb"},
|
| 2062 |
+
]
|
| 2063 |
+
|
| 2064 |
+
[package.dependencies]
|
| 2065 |
+
Flask = ">=0.10"
|
| 2066 |
+
jsonschema = ">=3.0.1"
|
| 2067 |
+
mistune = "*"
|
| 2068 |
+
packaging = "*"
|
| 2069 |
+
PyYAML = ">=3.0"
|
| 2070 |
+
six = ">=1.10.0"
|
| 2071 |
+
|
| 2072 |
[[package]]
|
| 2073 |
name = "flask"
|
| 2074 |
version = "3.0.3"
|
|
|
|
| 4413 |
orjson = ">=3.9.10,<3.11"
|
| 4414 |
pydantic = ">=2.5.2,<3"
|
| 4415 |
|
| 4416 |
+
[[package]]
|
| 4417 |
+
name = "mistune"
|
| 4418 |
+
version = "3.0.2"
|
| 4419 |
+
description = "A sane and fast Markdown parser with useful plugins and renderers"
|
| 4420 |
+
optional = false
|
| 4421 |
+
python-versions = ">=3.7"
|
| 4422 |
+
files = [
|
| 4423 |
+
{file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"},
|
| 4424 |
+
{file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"},
|
| 4425 |
+
]
|
| 4426 |
+
|
| 4427 |
[[package]]
|
| 4428 |
name = "mkl"
|
| 4429 |
version = "2021.4.0"
|
|
|
|
| 5192 |
]
|
| 5193 |
|
| 5194 |
[package.dependencies]
|
| 5195 |
+
numpy = [
|
| 5196 |
+
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
| 5197 |
+
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
| 5198 |
+
]
|
| 5199 |
|
| 5200 |
[[package]]
|
| 5201 |
name = "opencv-python-headless"
|
|
|
|
| 5214 |
]
|
| 5215 |
|
| 5216 |
[package.dependencies]
|
| 5217 |
+
numpy = [
|
| 5218 |
+
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
| 5219 |
+
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
| 5220 |
+
]
|
| 5221 |
|
| 5222 |
[[package]]
|
| 5223 |
name = "openpyxl"
|
|
|
|
| 5399 |
]
|
| 5400 |
|
| 5401 |
[package.dependencies]
|
| 5402 |
+
numpy = [
|
| 5403 |
+
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
| 5404 |
+
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
| 5405 |
+
]
|
| 5406 |
python-dateutil = ">=2.8.2"
|
| 5407 |
pytz = ">=2020.1"
|
| 5408 |
tzdata = ">=2022.7"
|
|
|
|
| 7061 |
[package.extras]
|
| 7062 |
test = ["timeout-decorator"]
|
| 7063 |
|
| 7064 |
+
[[package]]
|
| 7065 |
+
name = "redis"
|
| 7066 |
+
version = "5.0.3"
|
| 7067 |
+
description = "Python client for Redis database and key-value store"
|
| 7068 |
+
optional = false
|
| 7069 |
+
python-versions = ">=3.7"
|
| 7070 |
+
files = [
|
| 7071 |
+
{file = "redis-5.0.3-py3-none-any.whl", hash = "sha256:5da9b8fe9e1254293756c16c008e8620b3d15fcc6dde6babde9541850e72a32d"},
|
| 7072 |
+
{file = "redis-5.0.3.tar.gz", hash = "sha256:4973bae7444c0fbed64a06b87446f79361cb7e4ec1538c022d696ed7a5015580"},
|
| 7073 |
+
]
|
| 7074 |
+
|
| 7075 |
+
[package.dependencies]
|
| 7076 |
+
async-timeout = {version = ">=4.0.3", markers = "python_full_version < \"3.11.3\""}
|
| 7077 |
+
|
| 7078 |
+
[package.extras]
|
| 7079 |
+
hiredis = ["hiredis (>=1.0.0)"]
|
| 7080 |
+
ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"]
|
| 7081 |
+
|
| 7082 |
[[package]]
|
| 7083 |
name = "referencing"
|
| 7084 |
version = "0.35.1"
|
|
|
|
| 8538 |
nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 8539 |
nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 8540 |
sympy = "*"
|
| 8541 |
+
triton = {version = "2.3.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.12\""}
|
| 8542 |
typing-extensions = ">=4.8.0"
|
| 8543 |
|
| 8544 |
[package.extras]
|
|
|
|
| 8682 |
trio = ">=0.11"
|
| 8683 |
wsproto = ">=0.14"
|
| 8684 |
|
| 8685 |
+
[[package]]
|
| 8686 |
+
name = "triton"
|
| 8687 |
+
version = "2.3.0"
|
| 8688 |
+
description = "A language and compiler for custom Deep Learning operations"
|
| 8689 |
+
optional = false
|
| 8690 |
+
python-versions = "*"
|
| 8691 |
+
files = [
|
| 8692 |
+
{file = "triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ce4b8ff70c48e47274c66f269cce8861cf1dc347ceeb7a67414ca151b1822d8"},
|
| 8693 |
+
{file = "triton-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c3d9607f85103afdb279938fc1dd2a66e4f5999a58eb48a346bd42738f986dd"},
|
| 8694 |
+
{file = "triton-2.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:218d742e67480d9581bafb73ed598416cc8a56f6316152e5562ee65e33de01c0"},
|
| 8695 |
+
{file = "triton-2.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381ec6b3dac06922d3e4099cfc943ef032893b25415de295e82b1a82b0359d2c"},
|
| 8696 |
+
{file = "triton-2.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:038e06a09c06a164fef9c48de3af1e13a63dc1ba3c792871e61a8e79720ea440"},
|
| 8697 |
+
{file = "triton-2.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8f636e0341ac348899a47a057c3daea99ea7db31528a225a3ba4ded28ccc65"},
|
| 8698 |
+
]
|
| 8699 |
+
|
| 8700 |
+
[package.dependencies]
|
| 8701 |
+
filelock = "*"
|
| 8702 |
+
|
| 8703 |
+
[package.extras]
|
| 8704 |
+
build = ["cmake (>=3.20)", "lit"]
|
| 8705 |
+
tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"]
|
| 8706 |
+
tutorials = ["matplotlib", "pandas", "tabulate", "torch"]
|
| 8707 |
+
|
| 8708 |
[[package]]
|
| 8709 |
name = "typer"
|
| 8710 |
version = "0.12.5"
|
|
|
|
| 9540 |
|
| 9541 |
[metadata]
|
| 9542 |
lock-version = "2.0"
|
| 9543 |
+
python-versions = ">=3.11,<3.13"
|
| 9544 |
+
content-hash = "74a9b4afef47cc36d638b43fd918ece27d65259af1ca9e5b17f6b239774e8bf9"
|
pyproject.toml
CHANGED
|
@@ -8,7 +8,7 @@ readme = "README.md"
|
@@ -114,6 +114,7 @@ graspologic = "^3.4.1"
|
| 8 |
package-mode = false
|
| 9 |
|
| 10 |
[tool.poetry.dependencies]
|
| 11 |
+
python = ">=3.11,<3.13"
|
| 12 |
datrie = "0.8.2"
|
| 13 |
akshare = "^1.14.81"
|
| 14 |
azure-storage-blob = "12.22.0"
|
|
|
|
| 114 |
pymysql = "^1.1.1"
|
| 115 |
mini-racer = "^0.12.4"
|
| 116 |
pyicu = "^2.13.1"
|
| 117 |
+
flasgger = "^0.9.7.1"
|
| 118 |
|
| 119 |
|
| 120 |
[tool.poetry.group.full]
|
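`flasgger` is the only new direct dependency added to pyproject.toml (alongside a tightened Python version range); the lock file pins it together with packages such as `mistune`. Once a `Swagger` object is attached to the Flask app, docstrings written in the `---` / `tags:` / `parameters:` / `responses:` form used throughout this PR are collected into the OpenAPI spec and rendered in the interactive UI. A generic, minimal flasgger example for illustration only; this is not the repository's actual wiring:

```python
# Generic flasgger illustration (not RAGFlow's app factory).
from flasgger import Swagger
from flask import Flask, jsonify

app = Flask(__name__)
Swagger(app)   # by default the interactive docs are served under /apidocs


@app.route("/ping")
def ping():
    """
    Liveness probe.
    ---
    tags:
      - System
    responses:
      200:
        description: Service is up.
    """
    return jsonify(status="ok")


if __name__ == "__main__":
    app.run()
```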