Replaced md5 with xxhash64 for chunk id (#4009)
### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk ids. A chunk id only needs to be a stable, well-distributed fingerprint of the chunk content plus its document id, not a cryptographic digest, so the faster non-cryptographic xxhash64 is a better fit than MD5. A before/after sketch follows the file list below.

### Type of change

- [x] Refactoring

Files changed:

- api/apps/chunk_app.py +2 -4
- api/apps/sdk/doc.py +2 -5
- api/db/services/document_service.py +2 -5
- api/db/services/task_service.py +8 -12
- rag/svr/task_executor.py +3 -8
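
For orientation, the snippet below shows the before/after pattern this PR applies at every call site. It is a standalone sketch, not code from the PR; it assumes the xxhash package (`pip install xxhash`) and uses made-up sample values.

```python
import hashlib

import xxhash

content = "Some chunk text extracted from a document."  # sample value
doc_id = "doc_0001"                                     # sample value
payload = (content + doc_id).encode("utf-8")

# Before: MD5, a cryptographic hash -- 128-bit digest, 32 hex chars.
old_chunk_id = hashlib.md5(payload).hexdigest()

# After: xxhash64, a fast non-cryptographic hash -- 64-bit digest, 16 hex chars.
new_chunk_id = xxhash.xxh64(payload).hexdigest()

print(len(old_chunk_id), old_chunk_id)  # 32 ...
print(len(new_chunk_id), new_chunk_id)  # 16 ...
```

Both schemes are deterministic, so a chunk's id stays stable across re-runs; the new ids are simply half the length.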
api/apps/chunk_app.py
CHANGED

@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.db.services.document_service import DocumentService
 from api import settings
 from api.utils.api_utils import get_json_result
-import hashlib
+import xxhash
 import re


@@ -208,9 +208,7 @@ def rm():
 @validate_request("doc_id", "content_with_weight")
 def create():
     req = request.json
-    md5 = hashlib.md5()
-    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
-    chunck_id = md5.hexdigest()
+    chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
     d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
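The id in create() remains a pure function of content_with_weight + doc_id, as it was under MD5. A minimal sketch of that property; the make_chunk_id helper is hypothetical (the PR inlines the expression at each call site):

```python
import xxhash

def make_chunk_id(content_with_weight: str, doc_id: str) -> str:
    # Hypothetical helper mirroring the inlined expression in create().
    return xxhash.xxh64((content_with_weight + doc_id).encode("utf-8")).hexdigest()

a = make_chunk_id("same text", "doc_42")
b = make_chunk_id("same text", "doc_42")
c = make_chunk_id("same text", "doc_43")
assert a == b  # deterministic: identical content and doc id give identical ids
assert a != c  # a different doc id yields a different id (collisions aside)
```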
api/apps/sdk/doc.py
CHANGED

@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
 from api.db.services.llm_service import TenantLLMService
 from api import settings
-import hashlib
+import xxhash
 import re
 from api.utils.api_utils import token_required
 from api.db.db_models import Task

@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
         return get_error_data_result(
             "`questions` is required to be a list"
         )
-    md5 = hashlib.md5()
-    md5.update((req["content"] + document_id).encode("utf-8"))
-
-    chunk_id = md5.hexdigest()
+    chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
     d = {
         "id": chunk_id,
         "content_ltks": rag_tokenizer.tokenize(req["content"]),
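The motivation for xxhash64 is speed: it is a non-cryptographic hash built for throughput, and chunk ids need no cryptographic strength. A rough, machine-dependent comparison (a sketch, not a benchmark from the PR):

```python
import hashlib
import timeit

import xxhash

payload = ("sample chunk text " * 64).encode("utf-8")  # roughly 1 KB

md5_time = timeit.timeit(lambda: hashlib.md5(payload).hexdigest(), number=100_000)
xxh_time = timeit.timeit(lambda: xxhash.xxh64(payload).hexdigest(), number=100_000)

print(f"md5:   {md5_time:.3f}s for 100k digests")
print(f"xxh64: {xxh_time:.3f}s for 100k digests")
```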
api/db/services/document_service.py
CHANGED

@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 import logging
-import hashlib
+import xxhash
 import json
 import random
 import re

@@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
         for ck in th.result():
             d = deepcopy(doc)
             d.update(ck)
-            md5 = hashlib.md5()
-            md5.update((ck["content_with_weight"] +
-                        str(d["doc_id"])).encode("utf-8"))
-            d["id"] = md5.hexdigest()
+            d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
             d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
             d["create_timestamp_flt"] = datetime.now().timestamp()
             if not d.get("image"):
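One side effect worth flagging, as an inference rather than anything stated in the PR: chunks indexed before this change keep their 32-character MD5 ids, while new chunks get 16-character xxh64 ids, so both shapes can coexist in an existing deployment. Digest length alone can tell them apart if that ever matters:

```python
def id_scheme(chunk_id: str) -> str:
    # md5.hexdigest() is 32 hex chars; xxhash.xxh64(...).hexdigest() is 16.
    if len(chunk_id) == 32:
        return "md5 (pre-change)"
    if len(chunk_id) == 16:
        return "xxhash64 (post-change)"
    return "unknown"

print(id_scheme("9e107d9d372bb6826bd81d3542a419d6"))  # md5 (pre-change)
print(id_scheme("44bc2cf5ad770999"))                  # xxhash64 (post-change)
```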
api/db/services/task_service.py
CHANGED

@@ -35,17 +35,13 @@ from api import settings
 from rag.nlp import search

 def trim_header_by_lines(text: str, max_length) -> str:
-    if len(text) <= max_length:
+    len_text = len(text)
+    if len_text <= max_length:
         return text
-    lines = text.split("\n")
-    total = 0
-    idx = len(lines) - 1
-    for i in range(len(lines) - 1, -1, -1):
-        if total + len(lines[i]) > max_length:
-            break
-        idx = i
-    text2 = "\n".join(lines[idx:])
-    return text2
+    for i in range(len_text):
+        if text[i] == '\n' and len_text - i <= max_length:
+            return text[i+1:]
+    return text

 class TaskService(CommonService):
     model = Task

@@ -183,7 +179,7 @@ class TaskService(CommonService):
         if os.environ.get("MACOS"):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"],
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(

@@ -194,7 +190,7 @@ class TaskService(CommonService):
         with DB.lock("update_progress", -1):
             if info["progress_msg"]:
                 task = cls.model.get_by_id(id)
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"],
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
                 cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
             if "progress" in info:
                 cls.model.update(progress=info["progress"]).where(
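The rewritten trim_header_by_lines drops whole lines from the front of a progress log until what remains fits max_length, returning the text unchanged when it already fits or when no newline boundary makes it fit; update_progress now calls it with a 1000-character cap. A standalone copy of the new function with a small usage check:

```python
def trim_header_by_lines(text: str, max_length) -> str:
    len_text = len(text)
    if len_text <= max_length:
        return text
    for i in range(len_text):
        # Cut at the first newline whose remaining suffix fits the budget.
        if text[i] == '\n' and len_text - i <= max_length:
            return text[i + 1:]
    return text

log = "old line 1\nold line 2\nrecent line"
print(trim_header_by_lines(log, 100))  # fits already: returned unchanged
print(trim_header_by_lines(log, 15))   # "recent line" -- oldest lines dropped
```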
rag/svr/task_executor.py
CHANGED

@@ -27,7 +27,7 @@ import logging
 import os
 from datetime import datetime
 import json
-import hashlib
+import xxhash
 import copy
 import re
 import time

@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)
-        md5 = hashlib.md5()
-        md5.update((ck["content_with_weight"] +
-                    str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         if not d.get("image"):

@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
     tk_count = 0
     for content, vctr in chunks[original_length:]:
         d = copy.deepcopy(doc)
-        md5 = hashlib.md5()
-        md5.update((content + str(d["doc_id"])).encode("utf-8"))
-        d["id"] = md5.hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
         d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.now().timestamp()
         d[vctr_nm] = vctr.tolist()
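Unlike the API handlers, both executor call sites wrap the document id in str(); presumably d["doc_id"] can arrive as a non-string in the internal task dict, and string concatenation would otherwise raise. A small illustration with a made-up numeric id:

```python
import xxhash

content = "chunk text"
doc_id = 12345  # suppose the internal task dict carried a numeric id

# content + doc_id would raise TypeError; str() makes the concatenation safe.
chunk_id = xxhash.xxh64((content + str(doc_id)).encode("utf-8")).hexdigest()
print(chunk_id)
```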