Kevin Hu
commited on
Commit
·
0c54322
1
Parent(s):
5607916
Make infinity adapt to condition `exist`. (#4657)
Browse files### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- api/apps/kb_app.py +9 -1
- rag/raptor.py +3 -1
- rag/utils/es_conn.py +1 -1
- rag/utils/infinity_conn.py +56 -12
api/apps/kb_app.py
CHANGED
|
@@ -24,6 +24,7 @@ from api.db.services.document_service import DocumentService
|
|
| 24 |
from api.db.services.file2document_service import File2DocumentService
|
| 25 |
from api.db.services.file_service import FileService
|
| 26 |
from api.db.services.user_service import TenantService, UserTenantService
|
|
|
|
| 27 |
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request, not_allowed_parameters
|
| 28 |
from api.utils import get_uuid
|
| 29 |
from api.db import StatusEnum, FileSource
|
|
@@ -96,6 +97,13 @@ def update():
|
|
| 96 |
return get_data_error_result(
|
| 97 |
message="Can't find this knowledgebase!")
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
if req["name"].lower() != kb.name.lower() \
|
| 100 |
and len(
|
| 101 |
KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1:
|
|
@@ -112,7 +120,7 @@ def update():
|
|
| 112 |
search.index_name(kb.tenant_id), kb.id)
|
| 113 |
else:
|
| 114 |
# Elasticsearch requires PAGERANK_FLD be non-zero!
|
| 115 |
-
settings.docStoreConn.update({"
|
| 116 |
search.index_name(kb.tenant_id), kb.id)
|
| 117 |
|
| 118 |
e, kb = KnowledgebaseService.get_by_id(kb.id)
|
|
|
|
| 24 |
from api.db.services.file2document_service import File2DocumentService
|
| 25 |
from api.db.services.file_service import FileService
|
| 26 |
from api.db.services.user_service import TenantService, UserTenantService
|
| 27 |
+
from api.settings import DOC_ENGINE
|
| 28 |
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request, not_allowed_parameters
|
| 29 |
from api.utils import get_uuid
|
| 30 |
from api.db import StatusEnum, FileSource
|
|
|
|
| 97 |
return get_data_error_result(
|
| 98 |
message="Can't find this knowledgebase!")
|
| 99 |
|
| 100 |
+
if req.get("parser_id", "") == "tag" and DOC_ENGINE == "infinity":
|
| 101 |
+
return get_json_result(
|
| 102 |
+
data=False,
|
| 103 |
+
message='The chunk method Tag has not been supported by Infinity yet.',
|
| 104 |
+
code=settings.RetCode.OPERATING_ERROR
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
if req["name"].lower() != kb.name.lower() \
|
| 108 |
and len(
|
| 109 |
KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1:
|
|
|
|
| 120 |
search.index_name(kb.tenant_id), kb.id)
|
| 121 |
else:
|
| 122 |
# Elasticsearch requires PAGERANK_FLD be non-zero!
|
| 123 |
+
settings.docStoreConn.update({"exists": PAGERANK_FLD}, {"remove": PAGERANK_FLD},
|
| 124 |
search.index_name(kb.tenant_id), kb.id)
|
| 125 |
|
| 126 |
e, kb = KnowledgebaseService.get_by_id(kb.id)
|
rag/raptor.py
CHANGED
|
@@ -71,7 +71,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
|
| 71 |
start, end = 0, len(chunks)
|
| 72 |
if len(chunks) <= 1:
|
| 73 |
return
|
| 74 |
-
chunks = [(s, a) for s, a in chunks if len(a) > 0]
|
| 75 |
|
| 76 |
def summarize(ck_idx, lock):
|
| 77 |
nonlocal chunks
|
|
@@ -125,6 +125,8 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
|
| 125 |
threads = []
|
| 126 |
for c in range(n_clusters):
|
| 127 |
ck_idx = [i + start for i in range(len(lbls)) if lbls[i] == c]
|
|
|
|
|
|
|
| 128 |
threads.append(executor.submit(summarize, ck_idx, lock))
|
| 129 |
wait(threads, return_when=ALL_COMPLETED)
|
| 130 |
for th in threads:
|
|
|
|
| 71 |
start, end = 0, len(chunks)
|
| 72 |
if len(chunks) <= 1:
|
| 73 |
return
|
| 74 |
+
chunks = [(s, a) for s, a in chunks if s and len(a) > 0]
|
| 75 |
|
| 76 |
def summarize(ck_idx, lock):
|
| 77 |
nonlocal chunks
|
|
|
|
| 125 |
threads = []
|
| 126 |
for c in range(n_clusters):
|
| 127 |
ck_idx = [i + start for i in range(len(lbls)) if lbls[i] == c]
|
| 128 |
+
if not ck_idx:
|
| 129 |
+
continue
|
| 130 |
threads.append(executor.submit(summarize, ck_idx, lock))
|
| 131 |
wait(threads, return_when=ALL_COMPLETED)
|
| 132 |
for th in threads:
|
rag/utils/es_conn.py
CHANGED
|
@@ -336,7 +336,7 @@ class ESConnection(DocStoreConnection):
|
|
| 336 |
for k, v in condition.items():
|
| 337 |
if not isinstance(k, str) or not v:
|
| 338 |
continue
|
| 339 |
-
if k == "
|
| 340 |
bqry.filter.append(Q("exists", field=v))
|
| 341 |
continue
|
| 342 |
if isinstance(v, list):
|
|
|
|
| 336 |
for k, v in condition.items():
|
| 337 |
if not isinstance(k, str) or not v:
|
| 338 |
continue
|
| 339 |
+
if k == "exists":
|
| 340 |
bqry.filter.append(Q("exists", field=v))
|
| 341 |
continue
|
| 342 |
if isinstance(v, list):
|
rag/utils/infinity_conn.py
CHANGED
|
@@ -44,8 +44,23 @@ from rag.utils.doc_store_conn import (
|
|
| 44 |
logger = logging.getLogger('ragflow.infinity_conn')
|
| 45 |
|
| 46 |
|
| 47 |
-
def equivalent_condition_to_str(condition: dict) -> str | None:
|
| 48 |
assert "_id" not in condition
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
cond = list()
|
| 50 |
for k, v in condition.items():
|
| 51 |
if not isinstance(k, str) or k in ["kb_id"] or not v:
|
|
@@ -61,8 +76,15 @@ def equivalent_condition_to_str(condition: dict) -> str | None:
|
|
| 61 |
strInCond = ", ".join(inCond)
|
| 62 |
strInCond = f"{k} IN ({strInCond})"
|
| 63 |
cond.append(strInCond)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
elif isinstance(v, str):
|
| 65 |
cond.append(f"{k}='{v}'")
|
|
|
|
|
|
|
| 66 |
else:
|
| 67 |
cond.append(f"{k}={str(v)}")
|
| 68 |
return " AND ".join(cond) if cond else "1=1"
|
|
@@ -294,7 +316,11 @@ class InfinityConnection(DocStoreConnection):
|
|
| 294 |
filter_cond = None
|
| 295 |
filter_fulltext = ""
|
| 296 |
if condition:
|
| 297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
for matchExpr in matchExprs:
|
| 299 |
if isinstance(matchExpr, MatchTextExpr):
|
| 300 |
if filter_cond and "filter" not in matchExpr.extra_options:
|
|
@@ -434,12 +460,21 @@ class InfinityConnection(DocStoreConnection):
|
|
| 434 |
self.createIdx(indexName, knowledgebaseId, vector_size)
|
| 435 |
table_instance = db_instance.get_table(table_name)
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
docs = copy.deepcopy(documents)
|
| 438 |
for d in docs:
|
| 439 |
assert "_id" not in d
|
| 440 |
assert "id" in d
|
| 441 |
for k, v in d.items():
|
| 442 |
-
if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
|
| 443 |
assert isinstance(v, list)
|
| 444 |
d[k] = "###".join(v)
|
| 445 |
elif re.search(r"_feas$", k):
|
|
@@ -454,6 +489,11 @@ class InfinityConnection(DocStoreConnection):
|
|
| 454 |
elif k in ["page_num_int", "top_int"]:
|
| 455 |
assert isinstance(v, list)
|
| 456 |
d[k] = "_".join(f"{num:08x}" for num in v)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
ids = ["'{}'".format(d["id"]) for d in docs]
|
| 458 |
str_ids = ", ".join(ids)
|
| 459 |
str_filter = f"id IN ({str_ids})"
|
|
@@ -475,11 +515,11 @@ class InfinityConnection(DocStoreConnection):
|
|
| 475 |
db_instance = inf_conn.get_database(self.dbName)
|
| 476 |
table_name = f"{indexName}_{knowledgebaseId}"
|
| 477 |
table_instance = db_instance.get_table(table_name)
|
| 478 |
-
if "
|
| 479 |
-
|
| 480 |
-
filter = equivalent_condition_to_str(condition)
|
| 481 |
for k, v in list(newValue.items()):
|
| 482 |
-
if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
|
| 483 |
assert isinstance(v, list)
|
| 484 |
newValue[k] = "###".join(v)
|
| 485 |
elif re.search(r"_feas$", k):
|
|
@@ -496,9 +536,11 @@ class InfinityConnection(DocStoreConnection):
|
|
| 496 |
elif k in ["page_num_int", "top_int"]:
|
| 497 |
assert isinstance(v, list)
|
| 498 |
newValue[k] = "_".join(f"{num:08x}" for num in v)
|
| 499 |
-
elif k == "remove"
|
| 500 |
del newValue[k]
|
| 501 |
-
|
|
|
|
|
|
|
| 502 |
logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
|
| 503 |
table_instance.update(filter, newValue)
|
| 504 |
self.connPool.release_conn(inf_conn)
|
|
@@ -508,14 +550,14 @@ class InfinityConnection(DocStoreConnection):
|
|
| 508 |
inf_conn = self.connPool.get_conn()
|
| 509 |
db_instance = inf_conn.get_database(self.dbName)
|
| 510 |
table_name = f"{indexName}_{knowledgebaseId}"
|
| 511 |
-
filter = equivalent_condition_to_str(condition)
|
| 512 |
try:
|
| 513 |
table_instance = db_instance.get_table(table_name)
|
| 514 |
except Exception:
|
| 515 |
logger.warning(
|
| 516 |
-
f"Skipped deleting
|
| 517 |
)
|
| 518 |
return 0
|
|
|
|
| 519 |
logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
|
| 520 |
res = table_instance.delete(filter)
|
| 521 |
self.connPool.release_conn(inf_conn)
|
|
@@ -553,7 +595,7 @@ class InfinityConnection(DocStoreConnection):
|
|
| 553 |
v = res[fieldnm][i]
|
| 554 |
if isinstance(v, Series):
|
| 555 |
v = list(v)
|
| 556 |
-
elif fieldnm in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
|
| 557 |
assert isinstance(v, str)
|
| 558 |
v = [kwd for kwd in v.split("###") if kwd]
|
| 559 |
elif fieldnm == "position_int":
|
|
@@ -584,6 +626,8 @@ class InfinityConnection(DocStoreConnection):
|
|
| 584 |
ans = {}
|
| 585 |
num_rows = len(res)
|
| 586 |
column_id = res["id"]
|
|
|
|
|
|
|
| 587 |
for i in range(num_rows):
|
| 588 |
id = column_id[i]
|
| 589 |
txt = res[fieldnm][i]
|
|
|
|
| 44 |
logger = logging.getLogger('ragflow.infinity_conn')
|
| 45 |
|
| 46 |
|
| 47 |
+
def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
|
| 48 |
assert "_id" not in condition
|
| 49 |
+
clmns = {}
|
| 50 |
+
if table_instance:
|
| 51 |
+
for n, ty, de, _ in table_instance.show_columns().rows():
|
| 52 |
+
clmns[n] = (ty, de)
|
| 53 |
+
|
| 54 |
+
def exists(cln):
|
| 55 |
+
nonlocal clmns
|
| 56 |
+
assert cln in clmns, f"'{cln}' should be in '{clmns}'."
|
| 57 |
+
ty, de = clmns[cln]
|
| 58 |
+
if ty.lower().find("cha"):
|
| 59 |
+
if not de:
|
| 60 |
+
de = ""
|
| 61 |
+
return f" {cln}!='{de}' "
|
| 62 |
+
return f"{cln}!={de}"
|
| 63 |
+
|
| 64 |
cond = list()
|
| 65 |
for k, v in condition.items():
|
| 66 |
if not isinstance(k, str) or k in ["kb_id"] or not v:
|
|
|
|
| 76 |
strInCond = ", ".join(inCond)
|
| 77 |
strInCond = f"{k} IN ({strInCond})"
|
| 78 |
cond.append(strInCond)
|
| 79 |
+
elif k == "must_not":
|
| 80 |
+
if isinstance(v, dict):
|
| 81 |
+
for kk, vv in v.items():
|
| 82 |
+
if kk == "exists":
|
| 83 |
+
cond.append("NOT (%s)" % exists(vv))
|
| 84 |
elif isinstance(v, str):
|
| 85 |
cond.append(f"{k}='{v}'")
|
| 86 |
+
elif k == "exists":
|
| 87 |
+
cond.append(exists(v))
|
| 88 |
else:
|
| 89 |
cond.append(f"{k}={str(v)}")
|
| 90 |
return " AND ".join(cond) if cond else "1=1"
|
|
|
|
| 316 |
filter_cond = None
|
| 317 |
filter_fulltext = ""
|
| 318 |
if condition:
|
| 319 |
+
for indexName in indexNames:
|
| 320 |
+
table_name = f"{indexName}_{knowledgebaseIds[0]}"
|
| 321 |
+
filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
|
| 322 |
+
break
|
| 323 |
+
|
| 324 |
for matchExpr in matchExprs:
|
| 325 |
if isinstance(matchExpr, MatchTextExpr):
|
| 326 |
if filter_cond and "filter" not in matchExpr.extra_options:
|
|
|
|
| 460 |
self.createIdx(indexName, knowledgebaseId, vector_size)
|
| 461 |
table_instance = db_instance.get_table(table_name)
|
| 462 |
|
| 463 |
+
# embedding fields can't have a default value....
|
| 464 |
+
embedding_clmns = []
|
| 465 |
+
clmns = table_instance.show_columns().rows()
|
| 466 |
+
for n, ty, _, _ in clmns:
|
| 467 |
+
r = re.search(r"Embedding\([a-z]+,([0-9]+)\)", ty)
|
| 468 |
+
if not r:
|
| 469 |
+
continue
|
| 470 |
+
embedding_clmns.append((n, int(r.group(1))))
|
| 471 |
+
|
| 472 |
docs = copy.deepcopy(documents)
|
| 473 |
for d in docs:
|
| 474 |
assert "_id" not in d
|
| 475 |
assert "id" in d
|
| 476 |
for k, v in d.items():
|
| 477 |
+
if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
|
| 478 |
assert isinstance(v, list)
|
| 479 |
d[k] = "###".join(v)
|
| 480 |
elif re.search(r"_feas$", k):
|
|
|
|
| 489 |
elif k in ["page_num_int", "top_int"]:
|
| 490 |
assert isinstance(v, list)
|
| 491 |
d[k] = "_".join(f"{num:08x}" for num in v)
|
| 492 |
+
|
| 493 |
+
for n, vs in embedding_clmns:
|
| 494 |
+
if n in d:
|
| 495 |
+
continue
|
| 496 |
+
d[n] = [0] * vs
|
| 497 |
ids = ["'{}'".format(d["id"]) for d in docs]
|
| 498 |
str_ids = ", ".join(ids)
|
| 499 |
str_filter = f"id IN ({str_ids})"
|
|
|
|
| 515 |
db_instance = inf_conn.get_database(self.dbName)
|
| 516 |
table_name = f"{indexName}_{knowledgebaseId}"
|
| 517 |
table_instance = db_instance.get_table(table_name)
|
| 518 |
+
#if "exists" in condition:
|
| 519 |
+
# del condition["exists"]
|
| 520 |
+
filter = equivalent_condition_to_str(condition, table_instance)
|
| 521 |
for k, v in list(newValue.items()):
|
| 522 |
+
if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
|
| 523 |
assert isinstance(v, list)
|
| 524 |
newValue[k] = "###".join(v)
|
| 525 |
elif re.search(r"_feas$", k):
|
|
|
|
| 536 |
elif k in ["page_num_int", "top_int"]:
|
| 537 |
assert isinstance(v, list)
|
| 538 |
newValue[k] = "_".join(f"{num:08x}" for num in v)
|
| 539 |
+
elif k == "remove":
|
| 540 |
del newValue[k]
|
| 541 |
+
if v in [PAGERANK_FLD]:
|
| 542 |
+
newValue[v] = 0
|
| 543 |
+
|
| 544 |
logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
|
| 545 |
table_instance.update(filter, newValue)
|
| 546 |
self.connPool.release_conn(inf_conn)
|
|
|
|
| 550 |
inf_conn = self.connPool.get_conn()
|
| 551 |
db_instance = inf_conn.get_database(self.dbName)
|
| 552 |
table_name = f"{indexName}_{knowledgebaseId}"
|
|
|
|
| 553 |
try:
|
| 554 |
table_instance = db_instance.get_table(table_name)
|
| 555 |
except Exception:
|
| 556 |
logger.warning(
|
| 557 |
+
f"Skipped deleting from table {table_name} since the table doesn't exist."
|
| 558 |
)
|
| 559 |
return 0
|
| 560 |
+
filter = equivalent_condition_to_str(condition, table_instance)
|
| 561 |
logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
|
| 562 |
res = table_instance.delete(filter)
|
| 563 |
self.connPool.release_conn(inf_conn)
|
|
|
|
| 595 |
v = res[fieldnm][i]
|
| 596 |
if isinstance(v, Series):
|
| 597 |
v = list(v)
|
| 598 |
+
elif fieldnm in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
|
| 599 |
assert isinstance(v, str)
|
| 600 |
v = [kwd for kwd in v.split("###") if kwd]
|
| 601 |
elif fieldnm == "position_int":
|
|
|
|
| 626 |
ans = {}
|
| 627 |
num_rows = len(res)
|
| 628 |
column_id = res["id"]
|
| 629 |
+
if fieldnm not in res:
|
| 630 |
+
return {}
|
| 631 |
for i in range(num_rows):
|
| 632 |
id = column_id[i]
|
| 633 |
txt = res[fieldnm][i]
|