KevinHuSh
committed on
Commit
·
83a0020
1
Parent(s):
87a2c48
refactor (#1124)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Refactoring
- api/apps/__init__.py +0 -1
- api/apps/document_app.py +63 -66
- api/db/services/dialog_service.py +4 -3
- api/utils/api_utils.py +2 -1
- api/utils/log_utils.py +0 -5
- api/utils/web_utils.py +0 -2
- rag/llm/embedding_model.py +4 -5
api/apps/__init__.py
CHANGED
|
@@ -85,7 +85,6 @@ def register_page(page_path):
|
|
| 85 |
url_prefix = f'/api/{API_VERSION}/{page_name}' if "_api" in path else f'/{API_VERSION}/{page_name}'
|
| 86 |
|
| 87 |
app.register_blueprint(page.manager, url_prefix=url_prefix)
|
| 88 |
-
print(f'API file: {page_path}, URL: {url_prefix}')
|
| 89 |
return url_prefix
|
| 90 |
|
| 91 |
|
|
|
|
| 85 |
url_prefix = f'/api/{API_VERSION}/{page_name}' if "_api" in path else f'/{API_VERSION}/{page_name}'
|
| 86 |
|
| 87 |
app.register_blueprint(page.manager, url_prefix=url_prefix)
|
|
|
|
| 88 |
return url_prefix
|
| 89 |
|
| 90 |
|
api/apps/document_app.py
CHANGED
|
@@ -40,6 +40,7 @@ from api.utils.api_utils import get_json_result
|
|
| 40 |
from rag.utils.minio_conn import MINIO
|
| 41 |
from api.utils.file_utils import filename_type, thumbnail
|
| 42 |
from api.utils.web_utils import html2pdf, is_valid_url
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
@manager.route('/upload', methods=['POST'])
|
|
@@ -117,6 +118,68 @@ def upload():
|
|
| 117 |
return get_json_result(data=True)
|
| 118 |
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
@manager.route('/create', methods=['POST'])
|
| 121 |
@login_required
|
| 122 |
@validate_request("name", "kb_id")
|
|
@@ -417,69 +480,3 @@ def get_image(image_id):
|
|
| 417 |
return response
|
| 418 |
except Exception as e:
|
| 419 |
return server_error_response(e)
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
@manager.route('/web_crawl', methods=['POST'])
|
| 423 |
-
@login_required
|
| 424 |
-
def web_crawl():
|
| 425 |
-
kb_id = request.form.get("kb_id")
|
| 426 |
-
if not kb_id:
|
| 427 |
-
return get_json_result(
|
| 428 |
-
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
|
| 429 |
-
name = request.form.get("name")
|
| 430 |
-
url = request.form.get("url")
|
| 431 |
-
if not name:
|
| 432 |
-
return get_json_result(
|
| 433 |
-
data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
|
| 434 |
-
if not url:
|
| 435 |
-
return get_json_result(
|
| 436 |
-
data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
|
| 437 |
-
if not is_valid_url(url):
|
| 438 |
-
return get_json_result(
|
| 439 |
-
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
|
| 440 |
-
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
| 441 |
-
if not e:
|
| 442 |
-
raise LookupError("Can't find this knowledgebase!")
|
| 443 |
-
|
| 444 |
-
root_folder = FileService.get_root_folder(current_user.id)
|
| 445 |
-
pf_id = root_folder["id"]
|
| 446 |
-
FileService.init_knowledgebase_docs(pf_id, current_user.id)
|
| 447 |
-
kb_root_folder = FileService.get_kb_folder(current_user.id)
|
| 448 |
-
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
|
| 449 |
-
|
| 450 |
-
try:
|
| 451 |
-
filename = duplicate_name(
|
| 452 |
-
DocumentService.query,
|
| 453 |
-
name=name+".pdf",
|
| 454 |
-
kb_id=kb.id)
|
| 455 |
-
filetype = filename_type(filename)
|
| 456 |
-
if filetype == FileType.OTHER.value:
|
| 457 |
-
raise RuntimeError("This type of file has not been supported yet!")
|
| 458 |
-
|
| 459 |
-
location = filename
|
| 460 |
-
while MINIO.obj_exist(kb_id, location):
|
| 461 |
-
location += "_"
|
| 462 |
-
blob = html2pdf(url)
|
| 463 |
-
MINIO.put(kb_id, location, blob)
|
| 464 |
-
doc = {
|
| 465 |
-
"id": get_uuid(),
|
| 466 |
-
"kb_id": kb.id,
|
| 467 |
-
"parser_id": kb.parser_id,
|
| 468 |
-
"parser_config": kb.parser_config,
|
| 469 |
-
"created_by": current_user.id,
|
| 470 |
-
"type": filetype,
|
| 471 |
-
"name": filename,
|
| 472 |
-
"location": location,
|
| 473 |
-
"size": len(blob),
|
| 474 |
-
"thumbnail": thumbnail(filename, blob)
|
| 475 |
-
}
|
| 476 |
-
if doc["type"] == FileType.VISUAL:
|
| 477 |
-
doc["parser_id"] = ParserType.PICTURE.value
|
| 478 |
-
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
| 479 |
-
doc["parser_id"] = ParserType.PRESENTATION.value
|
| 480 |
-
DocumentService.insert(doc)
|
| 481 |
-
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
| 482 |
-
except Exception as e:
|
| 483 |
-
return get_json_result(
|
| 484 |
-
data=False, retmsg=e, retcode=RetCode.SERVER_ERROR)
|
| 485 |
-
return get_json_result(data=True)
|
|
|
|
| 40 |
from rag.utils.minio_conn import MINIO
|
| 41 |
from api.utils.file_utils import filename_type, thumbnail
|
| 42 |
from api.utils.web_utils import html2pdf, is_valid_url
|
| 43 |
+
from api.utils.web_utils import html2pdf, is_valid_url
|
| 44 |
|
| 45 |
|
| 46 |
@manager.route('/upload', methods=['POST'])
|
|
|
|
| 118 |
return get_json_result(data=True)
|
| 119 |
|
| 120 |
|
| 121 |
+
@manager.route('/web_crawl', methods=['POST'])
|
| 122 |
+
@login_required
|
| 123 |
+
@validate_request("kb_id", "name", "url")
|
| 124 |
+
def web_crawl():
|
| 125 |
+
kb_id = request.form.get("kb_id")
|
| 126 |
+
if not kb_id:
|
| 127 |
+
return get_json_result(
|
| 128 |
+
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
|
| 129 |
+
name = request.form.get("name")
|
| 130 |
+
url = request.form.get("url")
|
| 131 |
+
if not is_valid_url(url):
|
| 132 |
+
return get_json_result(
|
| 133 |
+
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
|
| 134 |
+
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
| 135 |
+
if not e:
|
| 136 |
+
raise LookupError("Can't find this knowledgebase!")
|
| 137 |
+
|
| 138 |
+
blob = html2pdf(url)
|
| 139 |
+
if not blob: return server_error_response(ValueError("Download failure."))
|
| 140 |
+
|
| 141 |
+
root_folder = FileService.get_root_folder(current_user.id)
|
| 142 |
+
pf_id = root_folder["id"]
|
| 143 |
+
FileService.init_knowledgebase_docs(pf_id, current_user.id)
|
| 144 |
+
kb_root_folder = FileService.get_kb_folder(current_user.id)
|
| 145 |
+
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
|
| 146 |
+
|
| 147 |
+
try:
|
| 148 |
+
filename = duplicate_name(
|
| 149 |
+
DocumentService.query,
|
| 150 |
+
name=name+".pdf",
|
| 151 |
+
kb_id=kb.id)
|
| 152 |
+
filetype = filename_type(filename)
|
| 153 |
+
if filetype == FileType.OTHER.value:
|
| 154 |
+
raise RuntimeError("This type of file has not been supported yet!")
|
| 155 |
+
|
| 156 |
+
location = filename
|
| 157 |
+
while MINIO.obj_exist(kb_id, location):
|
| 158 |
+
location += "_"
|
| 159 |
+
MINIO.put(kb_id, location, blob)
|
| 160 |
+
doc = {
|
| 161 |
+
"id": get_uuid(),
|
| 162 |
+
"kb_id": kb.id,
|
| 163 |
+
"parser_id": kb.parser_id,
|
| 164 |
+
"parser_config": kb.parser_config,
|
| 165 |
+
"created_by": current_user.id,
|
| 166 |
+
"type": filetype,
|
| 167 |
+
"name": filename,
|
| 168 |
+
"location": location,
|
| 169 |
+
"size": len(blob),
|
| 170 |
+
"thumbnail": thumbnail(filename, blob)
|
| 171 |
+
}
|
| 172 |
+
if doc["type"] == FileType.VISUAL:
|
| 173 |
+
doc["parser_id"] = ParserType.PICTURE.value
|
| 174 |
+
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
| 175 |
+
doc["parser_id"] = ParserType.PRESENTATION.value
|
| 176 |
+
DocumentService.insert(doc)
|
| 177 |
+
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
| 178 |
+
except Exception as e:
|
| 179 |
+
return server_error_response(e)
|
| 180 |
+
return get_json_result(data=True)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
@manager.route('/create', methods=['POST'])
|
| 184 |
@login_required
|
| 185 |
@validate_request("name", "kb_id")
|
|
|
|
| 480 |
return response
|
| 481 |
except Exception as e:
|
| 482 |
return server_error_response(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/db/services/dialog_service.py
CHANGED
|
@@ -112,14 +112,15 @@ def chat(dialog, messages, stream=True, **kwargs):
|
|
| 112 |
prompt_config["system"] = prompt_config["system"].replace(
|
| 113 |
"{%s}" % p["key"], " ")
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
for _ in range(len(questions) // 2):
|
| 116 |
questions.append(questions[-1])
|
| 117 |
if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
|
| 118 |
kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
|
| 119 |
else:
|
| 120 |
-
rerank_mdl = None
|
| 121 |
-
if dialog.rerank_id:
|
| 122 |
-
rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id)
|
| 123 |
kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
|
| 124 |
dialog.similarity_threshold,
|
| 125 |
dialog.vector_similarity_weight,
|
|
|
|
| 112 |
prompt_config["system"] = prompt_config["system"].replace(
|
| 113 |
"{%s}" % p["key"], " ")
|
| 114 |
|
| 115 |
+
rerank_mdl = None
|
| 116 |
+
if dialog.rerank_id:
|
| 117 |
+
rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id)
|
| 118 |
+
|
| 119 |
for _ in range(len(questions) // 2):
|
| 120 |
questions.append(questions[-1])
|
| 121 |
if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
|
| 122 |
kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
|
| 123 |
else:
|
|
|
|
|
|
|
|
|
|
| 124 |
kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
|
| 125 |
dialog.similarity_threshold,
|
| 126 |
dialog.vector_similarity_weight,
|
api/utils/api_utils.py
CHANGED
|
@@ -248,11 +248,12 @@ def construct_result(code=RetCode.DATA_ERROR, message='data is missing'):
|
|
| 248 |
|
| 249 |
|
| 250 |
def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):
|
| 251 |
-
if data == None:
|
| 252 |
return jsonify({"code": code, "message": message})
|
| 253 |
else:
|
| 254 |
return jsonify({"code": code, "message": message, "data": data})
|
| 255 |
|
|
|
|
| 256 |
def construct_error_response(e):
|
| 257 |
stat_logger.exception(e)
|
| 258 |
try:
|
|
|
|
| 248 |
|
| 249 |
|
| 250 |
def construct_json_result(code=RetCode.SUCCESS, message='success', data=None):
|
| 251 |
+
if data is None:
|
| 252 |
return jsonify({"code": code, "message": message})
|
| 253 |
else:
|
| 254 |
return jsonify({"code": code, "message": message, "data": data})
|
| 255 |
|
| 256 |
+
|
| 257 |
def construct_error_response(e):
|
| 258 |
stat_logger.exception(e)
|
| 259 |
try:
|
api/utils/log_utils.py
CHANGED
|
@@ -154,11 +154,6 @@ class LoggerFactory(object):
|
|
| 154 |
delay=True)
|
| 155 |
if level:
|
| 156 |
handler.level = level
|
| 157 |
-
else:
|
| 158 |
-
handler.level = LoggerFactory.LEVEL
|
| 159 |
-
|
| 160 |
-
formatter = logging.Formatter(LoggerFactory.LOG_FORMAT)
|
| 161 |
-
handler.setFormatter(formatter)
|
| 162 |
|
| 163 |
return handler
|
| 164 |
|
|
|
|
| 154 |
delay=True)
|
| 155 |
if level:
|
| 156 |
handler.level = level
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
return handler
|
| 159 |
|
api/utils/web_utils.py
CHANGED
|
@@ -78,5 +78,3 @@ def __get_pdf_from_html(
|
|
| 78 |
|
| 79 |
def is_valid_url(url: str) -> bool:
|
| 80 |
return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 78 |
|
| 79 |
def is_valid_url(url: str) -> bool:
|
| 80 |
return bool(re.match(r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))
|
|
|
|
|
|
rag/llm/embedding_model.py
CHANGED
|
@@ -26,9 +26,8 @@ import dashscope
|
|
| 26 |
from openai import OpenAI
|
| 27 |
from FlagEmbedding import FlagModel
|
| 28 |
import torch
|
| 29 |
-
import asyncio
|
| 30 |
import numpy as np
|
| 31 |
-
|
| 32 |
from api.utils.file_utils import get_home_cache_dir
|
| 33 |
from rag.utils import num_tokens_from_string, truncate
|
| 34 |
|
|
@@ -317,12 +316,12 @@ class InfinityEmbed(Base):
|
|
| 317 |
engine_kwargs: dict = {},
|
| 318 |
key = None,
|
| 319 |
):
|
| 320 |
-
|
| 321 |
from infinity_emb import EngineArgs
|
| 322 |
from infinity_emb.engine import AsyncEngineArray
|
| 323 |
-
|
| 324 |
self._default_model = model_names[0]
|
| 325 |
-
self.engine_array = AsyncEngineArray.from_args([EngineArgs(model_name_or_path = model_name, **engine_kwargs) for model_name in model_names])
|
| 326 |
|
| 327 |
async def _embed(self, sentences: list[str], model_name: str = ""):
|
| 328 |
if not model_name:
|
|
|
|
| 26 |
from openai import OpenAI
|
| 27 |
from FlagEmbedding import FlagModel
|
| 28 |
import torch
|
|
|
|
| 29 |
import numpy as np
|
| 30 |
+
import asyncio
|
| 31 |
from api.utils.file_utils import get_home_cache_dir
|
| 32 |
from rag.utils import num_tokens_from_string, truncate
|
| 33 |
|
|
|
|
| 316 |
engine_kwargs: dict = {},
|
| 317 |
key = None,
|
| 318 |
):
|
| 319 |
+
|
| 320 |
from infinity_emb import EngineArgs
|
| 321 |
from infinity_emb.engine import AsyncEngineArray
|
| 322 |
+
|
| 323 |
self._default_model = model_names[0]
|
| 324 |
+
self.engine_array = AsyncEngineArray.from_args([EngineArgs(model_name_or_path = model_name, **engine_kwargs) for model_name in model_names])
|
| 325 |
|
| 326 |
async def _embed(self, sentences: list[str], model_name: str = ""):
|
| 327 |
if not model_name:
|