Editing a chunk should update it instead of inserting a new one (#3709)
### What problem does this PR solve?
Editing a chunk now updates the existing chunk in the document store instead of inserting a new one. Closes #3679.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
### Files changed
- api/apps/chunk_app.py +1 -1
- api/apps/kb_app.py +3 -1
- api/apps/user_app.py +1 -1
- deepdoc/parser/docx_parser.py +1 -1
- deepdoc/parser/pdf_parser.py +5 -5
- deepdoc/parser/resume/entities/corporations.py +1 -1
- deepdoc/parser/resume/entities/schools.py +1 -1
- deepdoc/parser/resume/step_one.py +2 -2
- deepdoc/parser/resume/step_two.py +3 -3
- deepdoc/vision/table_structure_recognizer.py +1 -1
- rag/app/paper.py +2 -2
- rag/app/picture.py +1 -1
- rag/nlp/__init__.py +3 -3
- rag/nlp/query.py +5 -5
- rag/nlp/rag_tokenizer.py +3 -3
- rag/nlp/search.py +9 -9
- rag/nlp/term_weight.py +4 -4
- rag/utils/es_conn.py +4 -1
- rag/utils/infinity_conn.py +1 -1
api/apps/chunk_app.py
CHANGED
@@ -155,7 +155,7 @@ def set():
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
-        settings.docStoreConn.
+        settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)

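The one-line change above is the core of the fix: saving an edited chunk must target the existing document-store row keyed by the chunk id rather than writing a new one. The sketch below only illustrates that distinction; `ToyDocStore` and its method bodies are hypothetical, and only the shape of the `update(...)` call mirrors the diff.

```python
# Minimal sketch (not RAGFlow code): why editing a chunk must call update() rather than insert().
class ToyDocStore:
    def __init__(self):
        self.rows = {}  # chunk_id -> chunk fields

    def insert(self, doc: dict, index_name: str, kb_id: str):
        # insert() always writes a row under the given id; re-saving an edited chunk
        # this way can leave the previous row behind or clash with it.
        self.rows[doc["id"]] = doc

    def update(self, condition: dict, new_values: dict, index_name: str, kb_id: str):
        # update() modifies the row that matches the condition in place.
        row = self.rows.get(condition["id"])
        if row is not None:
            row.update(new_values)

store = ToyDocStore()
store.insert({"id": "chunk-1", "content_with_weight": "old text"}, "idx_tenant", "kb-1")
store.update({"id": "chunk-1"}, {"content_with_weight": "edited text"}, "idx_tenant", "kb-1")
assert len(store.rows) == 1 and store.rows["chunk-1"]["content_with_weight"] == "edited text"
```
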
api/apps/kb_app.py
CHANGED
@@ -168,7 +168,9 @@ def rm():
         if not KnowledgebaseService.delete_by_id(req["kb_id"]):
             return get_data_error_result(
                 message="Database error (Knowledgebase removal)!")
-
+        for kb in kbs:
+            settings.docStoreConn.delete({"kb_id": kb.id}, search.index_name(kb.tenant_id), kb.id)
+            settings.docStoreConn.deleteIdx(search.index_name(kb.tenant_id), kb.id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)

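Knowledge-base removal now also cleans up the document store. A hedged sketch of the added cleanup order, with `doc_store`, `kbs`, and `index_name_for` as hypothetical stand-ins for `settings.docStoreConn`, the knowledge bases being removed, and `search.index_name`:

```python
def purge_knowledgebases(doc_store, kbs, index_name_for):
    for kb in kbs:
        # Delete every chunk that belongs to this knowledge base.
        doc_store.delete({"kb_id": kb.id}, index_name_for(kb.tenant_id), kb.id)
        # Ask the backend to drop the index; an Elasticsearch backend may decline while
        # other knowledge bases of the same tenant still share it (see rag/utils/es_conn.py below).
        doc_store.deleteIdx(index_name_for(kb.tenant_id), kb.id)
```
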
api/apps/user_app.py
CHANGED
@@ -252,7 +252,7 @@ def feishu_callback():
     if res["code"] != 0:
         return redirect("/?error=%s" % res["message"])

-    if "contact:user.email:readonly" not in res["data"]["scope"].split(
+    if "contact:user.email:readonly" not in res["data"]["scope"].split():
         return redirect("/?error=contact:user.email:readonly not in scope")
     session["access_token"] = res["data"]["access_token"]
     session["access_token_from"] = "feishu"

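The Feishu callback fix completes the `split()` call so the granted scopes are compared as individual tokens. A toy illustration (the scope string is made up, not a real Feishu response):

```python
# The scope payload arrives as one whitespace-separated string; split it before testing membership.
scope_string = "contact:user.base:readonly contact:user.email:readonly"
granted = scope_string.split()
if "contact:user.email:readonly" not in granted:
    raise PermissionError("contact:user.email:readonly not in scope")
```
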
deepdoc/parser/docx_parser.py
CHANGED
@@ -47,7 +47,7 @@ class RAGFlowDocxParser:
             for p, n in patt:
                 if re.search(p, b):
                     return n
-            tks = [t for t in rag_tokenizer.tokenize(b).split(
+            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
             if len(tks) > 3:
                 if len(tks) < 12:
                     return "Tx"

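This change, like the matching one-liners in pdf_parser.py, the resume parsers, table_structure_recognizer.py, and the rag/nlp and rag/utils modules below, switches the call site to the no-argument `str.split()`. Unlike splitting on an explicit single-character delimiter (which the truncated old lines appear to have used), the no-argument form splits on any run of whitespace and never yields empty strings. A plain-Python illustration with a made-up token string:

```python
tokens = "foo  bar\tbaz "     # double space, a tab, and a trailing space

print(tokens.split(" "))      # ['foo', '', 'bar\tbaz', '']  - empty tokens, tab not treated as a separator
print(tokens.split())         # ['foo', 'bar', 'baz']        - splits on any whitespace run, no empty tokens
```

This matters downstream, where the token lists are counted (e.g. `len(txt.split()) > 32`) and indexed (`split()[:2]`, `split()[-1]`); empty-string tokens would skew both.
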
deepdoc/parser/pdf_parser.py
CHANGED
@@ -108,13 +108,13 @@ class RAGFlowPdfParser:
         h = max(self.__height(up), self.__height(down))
         y_dis = self._y_dis(up, down)
         LEN = 6
-        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(
-        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(
+        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
+        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
         tks_all = up["text"][-LEN:].strip() \
             + (" " if re.match(r"[a-zA-Z0-9]+",
                                up["text"][-1] + down["text"][0]) else "") \
             + down["text"][:LEN].strip()
-        tks_all = rag_tokenizer.tokenize(tks_all).split(
+        tks_all = rag_tokenizer.tokenize(tks_all).split()
         fea = [
             up.get("R", -1) == down.get("R", -1),
             y_dis / h,
@@ -565,13 +565,13 @@ class RAGFlowPdfParser:
             if i >= len(self.boxes):
                 break
             prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
-                self.boxes[i]["text"].strip().split(
+                self.boxes[i]["text"].strip().split()[:2])
             while not prefix:
                 self.boxes.pop(i)
                 if i >= len(self.boxes):
                     break
                 prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
-                    self.boxes[i]["text"].strip().split(
+                    self.boxes[i]["text"].strip().split()[:2])
                 self.boxes.pop(i)
             if i >= len(self.boxes) or not prefix:
                 break

deepdoc/parser/resume/entities/corporations.py
CHANGED
@@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
     nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
     if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm

-    tks = rag_tokenizer.tokenize(nm).split(
+    tks = rag_tokenizer.tokenize(nm).split()
     reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:

deepdoc/parser/resume/entities/schools.py
CHANGED
@@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))

 def split(txt):
     tks = []
-    for t in re.sub(r"[ \t]+", " ",txt).split(
+    for t in re.sub(r"[ \t]+", " ",txt).split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                 re.match(r"[a-zA-Z]", t) and tks:
             tks[-1] = tks[-1] + " " + t

deepdoc/parser/resume/step_one.py
CHANGED
@@ -80,7 +80,7 @@ def refactor(df):
 def loadjson(line):
     try:
         return json.loads(line)
-    except Exception
+    except Exception:
         pass
     return {}

@@ -183,4 +183,4 @@ def refactor(df):
                                   "\r",
                                   "\\n"))
     # print(df.values.tolist())
-    return dict(zip([n.split(
+    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))

deepdoc/parser/resume/step_two.py
CHANGED
@@ -100,7 +100,7 @@ def forEdu(cv):
         if n.get("school_name") and isinstance(n["school_name"], str):
             sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
             e["sch_nm_kwd"] = sch[-1]
-            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(
+            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])

         if n.get("discipline_name") and isinstance(n["discipline_name"], str):
             maj.append(n["discipline_name"])
@@ -485,7 +485,7 @@ def parse(cv):
         nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
         nm = re.sub(r"[ \t ]+", " ", nm)
         if re.match(r"[a-zA-Z ]+$", nm):
-            if len(nm.split(
+            if len(nm.split()) > 1:
                 cv["name"] = nm
             else:
                 nm = ""
@@ -503,7 +503,7 @@ def parse(cv):
         for py in PY.get_pinyins(nm[:20], ''):
             for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
         for py in PY.get_pinyins(nm[:20], ' '):
-            py = py.split(
+            py = py.split()
             for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])

     cv["name_kwd"] = name

deepdoc/vision/table_structure_recognizer.py
CHANGED
@@ -117,7 +117,7 @@ class TableStructureRecognizer(Recognizer):
         for p, n in patt:
             if re.search(p, b["text"].strip()):
                 return n
-        tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(
+        tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
         if len(tks) > 3:
             if len(tks) < 12:
                 return "Tx"

rag/app/paper.py
CHANGED
@@ -99,11 +99,11 @@ class Pdf(PdfParser):
             i += 1
             txt = b["text"].lower().strip()
             if re.match("(abstract|摘要)", txt):
-                if len(txt.split(
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(b, zoomin)
                     break
                 txt = self.boxes[i]["text"].lower().strip()
-                if len(txt.split(
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(self.boxes[i], zoomin)
                     i += 1
                     break

rag/app/picture.py
CHANGED
@@ -33,7 +33,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
     eng = lang.lower() == "english"
     callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
-    if (eng and len(txt.split(
+    if (eng and len(txt.split()) > 32) or len(txt) > 32:
         tokenize(doc, txt, eng)
         callback(0.8, "OCR results is too long to use CV LLM.")
         return [doc]

rag/nlp/__init__.py
CHANGED
@@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False):
         sections.pop(i)
         if i >= len(sections):
             break
-        prefix = get(i)[:3] if not eng else " ".join(get(i).split(
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         while not prefix:
             sections.pop(i)
             if i >= len(sections):
                 break
-            prefix = get(i)[:3] if not eng else " ".join(get(i).split(
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
             sections.pop(i)
         if i >= len(sections) or not prefix:
             break
@@ -389,7 +389,7 @@ def title_frequency(bull, sections):
     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
             return False
-        if len(txt.split(
+        if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
             return True
         return re.search(r"[,;,。;!!]", txt)

rag/nlp/query.py
CHANGED
@@ -74,7 +74,7 @@ class FulltextQueryer:

         if not self.isChinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
-            tks = rag_tokenizer.tokenize(txt).split(
+            tks = rag_tokenizer.tokenize(txt).split()
             keywords = [t for t in tks if t]
             tks_w = self.tw.weights(tks, preprocess=False)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -83,7 +83,7 @@ class FulltextQueryer:
             syns = []
             for tk, w in tks_w:
                 syn = self.syn.lookup(tk)
-                syn = rag_tokenizer.tokenize(" ".join(syn)).split(
+                syn = rag_tokenizer.tokenize(" ".join(syn)).split()
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
                 syns.append(" ".join(syn))
@@ -114,7 +114,7 @@ class FulltextQueryer:

         txt = FulltextQueryer.rmWWW(txt)
         qs, keywords = [], []
-        for tt in self.tw.split(txt)[:256]:  # .split(
+        for tt in self.tw.split(txt)[:256]:  # .split():
             if not tt:
                 continue
             keywords.append(tt)
@@ -125,7 +125,7 @@ class FulltextQueryer:
             tms = []
             for tk, w in sorted(twts, key=lambda x: x[1] * -1):
                 sm = (
-                    rag_tokenizer.fine_grained_tokenize(tk).split(
+                    rag_tokenizer.fine_grained_tokenize(tk).split()
                     if need_fine_grained_tokenize(tk)
                     else []
                 )
@@ -194,7 +194,7 @@ class FulltextQueryer:
         def toDict(tks):
             d = {}
             if isinstance(tks, str):
-                tks = tks.split(
+                tks = tks.split()
             for t, c in self.tw.weights(tks, preprocess=False):
                 if t not in d:
                     d[t] = 0

rag/nlp/rag_tokenizer.py
CHANGED
@@ -192,7 +192,7 @@ class RagTokenizer:

         # if split chars is part of token
         res = []
-        tks = re.sub(r"[ ]+", " ", tks).split(
+        tks = re.sub(r"[ ]+", " ", tks).split()
         s = 0
         while True:
             if s >= len(tks):
@@ -329,7 +329,7 @@ class RagTokenizer:
         return self.merge_(res)

     def fine_grained_tokenize(self, tks):
-        tks = tks.split(
+        tks = tks.split()
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
         if zh_num < len(tks) * 0.2:
             res = []
@@ -393,7 +393,7 @@ def is_alphabet(s):

 def naiveQie(txt):
     tks = []
-    for t in txt.split(
+    for t in txt.split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]
                             ) and re.match(r".*[a-zA-Z]$", t):
             tks.append(" ")

rag/nlp/search.py
CHANGED
@@ -114,7 +114,7 @@ class Dealer:

         for k in keywords:
             kwds.add(k)
-            for kk in rag_tokenizer.fine_grained_tokenize(k).split(
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -186,7 +186,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))

-        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -195,7 +195,7 @@ class Dealer:
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             rag_tokenizer.tokenize(
-                                                                self.qryr.rmWWW(pieces_[i])).split(
+                                                                self.qryr.rmWWW(pieces_[i])).split(),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
@@ -244,8 +244,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -265,8 +265,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -279,8 +279,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           rag_tokenizer.tokenize(ans).split(
-                                           rag_tokenizer.tokenize(inst).split(
+                                           rag_tokenizer.tokenize(ans).split(),
+                                           rag_tokenizer.tokenize(inst).split())

     def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):

rag/nlp/term_weight.py
CHANGED
@@ -99,7 +99,7 @@ class Dealer:
             txt = re.sub(p, r, txt)

         res = []
-        for t in rag_tokenizer.tokenize(txt).split(
+        for t in rag_tokenizer.tokenize(txt).split():
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
@@ -150,7 +150,7 @@ class Dealer:

     def split(self, txt):
         tks = []
-        for t in re.sub(r"[ \t]+", " ", txt).split(
+        for t in re.sub(r"[ \t]+", " ", txt).split():
             if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                     re.match(r".*[a-zA-Z]$", t) and tks and \
                     self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
@@ -198,7 +198,7 @@ class Dealer:
             s = 0

         if not s and len(t) >= 4:
-            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(
+            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
             if len(s) > 1:
                 s = np.min([freq(tt) for tt in s]) / 6.
             else:
@@ -214,7 +214,7 @@ class Dealer:
         elif re.match(r"[a-z. -]+$", t):
             return 300
         elif len(t) >= 4:
-            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(
+            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
             if len(s) > 1:
                 return max(3, np.min([df(tt) for tt in s]) / 6.)

rag/utils/es_conn.py
CHANGED
@@ -85,6 +85,9 @@ class ESConnection(DocStoreConnection):
             logging.exception("ESConnection.createIndex error %s" % (indexName))

     def deleteIdx(self, indexName: str, knowledgebaseId: str):
+        if len(knowledgebaseId) > 0:
+            # The index need to be alive after any kb deletion since all kb under this tenant are in one index.
+            return
         try:
             self.es.indices.delete(index=indexName, allow_no_indices=True)
         except NotFoundError:
@@ -400,7 +403,7 @@ class ESConnection(DocStoreConnection):
             if not hlts:
                 continue
             txt = "...".join([a for a in list(hlts.items())[0][1]])
-            if not is_english(txt.split(
+            if not is_english(txt.split()):
                 ans[d["_id"]] = txt
                 continue

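The new guard in `deleteIdx` reflects the added comment: with the Elasticsearch backend, every knowledge base of a tenant lives in one shared index, so removing a single knowledge base must not drop the physical index. A simplified sketch of that logic (standalone function, hypothetical `es` client, error handling trimmed):

```python
def delete_idx(es, index_name: str, knowledgebase_id: str) -> None:
    if len(knowledgebase_id) > 0:
        # Deleting one knowledge base must keep the shared per-tenant index alive;
        # its documents were already removed by the delete-by-kb_id call in api/apps/kb_app.py.
        return
    # Only a tenant-wide cleanup (empty knowledgebase_id) removes the physical index.
    es.indices.delete(index=index_name, allow_no_indices=True)
```
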
rag/utils/infinity_conn.py
CHANGED
@@ -419,7 +419,7 @@ class InfinityConnection(DocStoreConnection):
             v = list(v)
         elif fieldnm == "important_kwd":
             assert isinstance(v, str)
-            v = v.split(
+            v = v.split()
         else:
             if not isinstance(v, str):
                 v = str(v)