Editing a chunk should update it instead of inserting a new one (#3709)
### What problem does this PR solve?
Editing a chunk now updates the existing chunk in the document store instead of inserting a new one. Closes #3679.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
### Files changed
- api/apps/chunk_app.py +1 -1
- api/apps/kb_app.py +3 -1
- api/apps/user_app.py +1 -1
- deepdoc/parser/docx_parser.py +1 -1
- deepdoc/parser/pdf_parser.py +5 -5
- deepdoc/parser/resume/entities/corporations.py +1 -1
- deepdoc/parser/resume/entities/schools.py +1 -1
- deepdoc/parser/resume/step_one.py +2 -2
- deepdoc/parser/resume/step_two.py +3 -3
- deepdoc/vision/table_structure_recognizer.py +1 -1
- rag/app/paper.py +2 -2
- rag/app/picture.py +1 -1
- rag/nlp/__init__.py +3 -3
- rag/nlp/query.py +5 -5
- rag/nlp/rag_tokenizer.py +3 -3
- rag/nlp/search.py +9 -9
- rag/nlp/term_weight.py +4 -4
- rag/utils/es_conn.py +4 -1
- rag/utils/infinity_conn.py +1 -1
api/apps/chunk_app.py
CHANGED
@@ -155,7 +155,7 @@ def set():
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
-        settings.docStoreConn.
+        settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)

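The one-line change above is the core of the fix: saving an edited chunk must target the existing document-store row keyed by the chunk id rather than writing a new one. The sketch below only illustrates that distinction; `ToyDocStore` and its method bodies are hypothetical, and only the shape of the `update(...)` call mirrors the diff.

```python
# Minimal sketch (not RAGFlow code): why editing a chunk must call update() rather than insert().
class ToyDocStore:
    def __init__(self):
        self.rows = {}  # chunk_id -> chunk fields

    def insert(self, doc: dict, index_name: str, kb_id: str):
        # insert() always writes a row under the given id; re-saving an edited chunk
        # this way can leave the previous row behind or clash with it.
        self.rows[doc["id"]] = doc

    def update(self, condition: dict, new_values: dict, index_name: str, kb_id: str):
        # update() modifies the row that matches the condition in place.
        row = self.rows.get(condition["id"])
        if row is not None:
            row.update(new_values)

store = ToyDocStore()
store.insert({"id": "chunk-1", "content_with_weight": "old text"}, "idx_tenant", "kb-1")
store.update({"id": "chunk-1"}, {"content_with_weight": "edited text"}, "idx_tenant", "kb-1")
assert len(store.rows) == 1 and store.rows["chunk-1"]["content_with_weight"] == "edited text"
```
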
api/apps/kb_app.py
CHANGED
@@ -168,7 +168,9 @@ def rm():
         if not KnowledgebaseService.delete_by_id(req["kb_id"]):
             return get_data_error_result(
                 message="Database error (Knowledgebase removal)!")
-
+        for kb in kbs:
+            settings.docStoreConn.delete({"kb_id": kb.id}, search.index_name(kb.tenant_id), kb.id)
+            settings.docStoreConn.deleteIdx(search.index_name(kb.tenant_id), kb.id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)

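Knowledge-base removal now also cleans up the document store. A hedged sketch of the added cleanup order, with `doc_store`, `kbs`, and `index_name_for` as hypothetical stand-ins for `settings.docStoreConn`, the knowledge bases being removed, and `search.index_name`:

```python
def purge_knowledgebases(doc_store, kbs, index_name_for):
    for kb in kbs:
        # Delete every chunk that belongs to this knowledge base.
        doc_store.delete({"kb_id": kb.id}, index_name_for(kb.tenant_id), kb.id)
        # Ask the backend to drop the index; an Elasticsearch backend may decline while
        # other knowledge bases of the same tenant still share it (see rag/utils/es_conn.py below).
        doc_store.deleteIdx(index_name_for(kb.tenant_id), kb.id)
```
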
api/apps/user_app.py
CHANGED
@@ -252,7 +252,7 @@ def feishu_callback():
     if res["code"] != 0:
         return redirect("/?error=%s" % res["message"])

-    if "contact:user.email:readonly" not in res["data"]["scope"].split(
+    if "contact:user.email:readonly" not in res["data"]["scope"].split():
         return redirect("/?error=contact:user.email:readonly not in scope")
     session["access_token"] = res["data"]["access_token"]
     session["access_token_from"] = "feishu"

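The Feishu callback fix completes the `split()` call so the granted scopes are compared as individual tokens. A toy illustration (the scope string is made up, not a real Feishu response):

```python
# The scope payload arrives as one whitespace-separated string; split it before testing membership.
scope_string = "contact:user.base:readonly contact:user.email:readonly"
granted = scope_string.split()
if "contact:user.email:readonly" not in granted:
    raise PermissionError("contact:user.email:readonly not in scope")
```
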
deepdoc/parser/docx_parser.py
CHANGED
@@ -47,7 +47,7 @@ class RAGFlowDocxParser:
             for p, n in patt:
                 if re.search(p, b):
                     return n
-            tks = [t for t in rag_tokenizer.tokenize(b).split(
+            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
             if len(tks) > 3:
                 if len(tks) < 12:
                     return "Tx"

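This change, like the matching one-liners in pdf_parser.py, the resume parsers, table_structure_recognizer.py, and the rag/nlp and rag/utils modules below, switches the call site to the no-argument `str.split()`. Unlike splitting on an explicit single-character delimiter (which the truncated old lines appear to have used), the no-argument form splits on any run of whitespace and never yields empty strings. A plain-Python illustration with a made-up token string:

```python
tokens = "foo  bar\tbaz "     # double space, a tab, and a trailing space

print(tokens.split(" "))      # ['foo', '', 'bar\tbaz', '']  - empty tokens, tab not treated as a separator
print(tokens.split())         # ['foo', 'bar', 'baz']        - splits on any whitespace run, no empty tokens
```

This matters downstream, where the token lists are counted (e.g. `len(txt.split()) > 32`) and indexed (`split()[:2]`, `split()[-1]`); empty-string tokens would skew both.
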
deepdoc/parser/pdf_parser.py
CHANGED
@@ -108,13 +108,13 @@ class RAGFlowPdfParser:
         h = max(self.__height(up), self.__height(down))
         y_dis = self._y_dis(up, down)
         LEN = 6
-        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(
-        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(
+        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
+        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
         tks_all = up["text"][-LEN:].strip() \
             + (" " if re.match(r"[a-zA-Z0-9]+",
                                up["text"][-1] + down["text"][0]) else "") \
             + down["text"][:LEN].strip()
-        tks_all = rag_tokenizer.tokenize(tks_all).split(
+        tks_all = rag_tokenizer.tokenize(tks_all).split()
         fea = [
             up.get("R", -1) == down.get("R", -1),
             y_dis / h,
@@ -565,13 +565,13 @@ class RAGFlowPdfParser:
             if i >= len(self.boxes):
                 break
             prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
-                self.boxes[i]["text"].strip().split(
+                self.boxes[i]["text"].strip().split()[:2])
             while not prefix:
                 self.boxes.pop(i)
                 if i >= len(self.boxes):
                     break
                 prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
-                    self.boxes[i]["text"].strip().split(
+                    self.boxes[i]["text"].strip().split()[:2])
                 self.boxes.pop(i)
             if i >= len(self.boxes) or not prefix:
                 break

deepdoc/parser/resume/entities/corporations.py
CHANGED
@@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
     nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
     if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm

-    tks = rag_tokenizer.tokenize(nm).split(
+    tks = rag_tokenizer.tokenize(nm).split()
     reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:

deepdoc/parser/resume/entities/schools.py
CHANGED
@@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))

 def split(txt):
     tks = []
-    for t in re.sub(r"[ \t]+", " ",txt).split(
+    for t in re.sub(r"[ \t]+", " ",txt).split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                 re.match(r"[a-zA-Z]", t) and tks:
             tks[-1] = tks[-1] + " " + t

deepdoc/parser/resume/step_one.py
CHANGED
@@ -80,7 +80,7 @@ def refactor(df):
 def loadjson(line):
     try:
         return json.loads(line)
-    except Exception
+    except Exception:
         pass
     return {}

@@ -183,4 +183,4 @@ def refactor(df):
                                   "\r",
                                   "\\n"))
     # print(df.values.tolist())
-    return dict(zip([n.split(
+    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))

deepdoc/parser/resume/step_two.py
CHANGED
@@ -100,7 +100,7 @@ def forEdu(cv):
         if n.get("school_name") and isinstance(n["school_name"], str):
             sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
             e["sch_nm_kwd"] = sch[-1]
-            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(
+            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])

         if n.get("discipline_name") and isinstance(n["discipline_name"], str):
             maj.append(n["discipline_name"])
@@ -485,7 +485,7 @@ def parse(cv):
         nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
         nm = re.sub(r"[ \t ]+", " ", nm)
         if re.match(r"[a-zA-Z ]+$", nm):
-            if len(nm.split(
+            if len(nm.split()) > 1:
                 cv["name"] = nm
             else:
                 nm = ""
@@ -503,7 +503,7 @@ def parse(cv):
         for py in PY.get_pinyins(nm[:20], ''):
             for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
         for py in PY.get_pinyins(nm[:20], ' '):
-            py = py.split(
+            py = py.split()
             for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])

     cv["name_kwd"] = name

deepdoc/vision/table_structure_recognizer.py
CHANGED
@@ -117,7 +117,7 @@ class TableStructureRecognizer(Recognizer):
         for p, n in patt:
             if re.search(p, b["text"].strip()):
                 return n
-        tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(
+        tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
         if len(tks) > 3:
             if len(tks) < 12:
                 return "Tx"

rag/app/paper.py
CHANGED
@@ -99,11 +99,11 @@ class Pdf(PdfParser):
             i += 1
             txt = b["text"].lower().strip()
             if re.match("(abstract|摘要)", txt):
-                if len(txt.split(
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(b, zoomin)
                     break
                 txt = self.boxes[i]["text"].lower().strip()
-                if len(txt.split(
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(self.boxes[i], zoomin)
                     i += 1
                     break

rag/app/picture.py
CHANGED
@@ -33,7 +33,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
     eng = lang.lower() == "english"
     callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
-    if (eng and len(txt.split(
+    if (eng and len(txt.split()) > 32) or len(txt) > 32:
         tokenize(doc, txt, eng)
         callback(0.8, "OCR results is too long to use CV LLM.")
         return [doc]

rag/nlp/__init__.py
CHANGED
@@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False):
         sections.pop(i)
         if i >= len(sections):
             break
-        prefix = get(i)[:3] if not eng else " ".join(get(i).split(
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         while not prefix:
             sections.pop(i)
             if i >= len(sections):
                 break
-            prefix = get(i)[:3] if not eng else " ".join(get(i).split(
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
             sections.pop(i)
         if i >= len(sections) or not prefix:
             break
@@ -389,7 +389,7 @@ def title_frequency(bull, sections):
     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
             return False
-        if len(txt.split(
+        if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
             return True
         return re.search(r"[,;,。;!!]", txt)

rag/nlp/query.py
CHANGED
@@ -74,7 +74,7 @@ class FulltextQueryer:

         if not self.isChinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
-            tks = rag_tokenizer.tokenize(txt).split(
+            tks = rag_tokenizer.tokenize(txt).split()
             keywords = [t for t in tks if t]
             tks_w = self.tw.weights(tks, preprocess=False)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -83,7 +83,7 @@ class FulltextQueryer:
             syns = []
             for tk, w in tks_w:
                 syn = self.syn.lookup(tk)
-                syn = rag_tokenizer.tokenize(" ".join(syn)).split(
+                syn = rag_tokenizer.tokenize(" ".join(syn)).split()
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
                 syns.append(" ".join(syn))
@@ -114,7 +114,7 @@ class FulltextQueryer:

         txt = FulltextQueryer.rmWWW(txt)
         qs, keywords = [], []
-        for tt in self.tw.split(txt)[:256]:  # .split(
+        for tt in self.tw.split(txt)[:256]:  # .split():
             if not tt:
                 continue
             keywords.append(tt)
@@ -125,7 +125,7 @@ class FulltextQueryer:
             tms = []
             for tk, w in sorted(twts, key=lambda x: x[1] * -1):
                 sm = (
-                    rag_tokenizer.fine_grained_tokenize(tk).split(
+                    rag_tokenizer.fine_grained_tokenize(tk).split()
                     if need_fine_grained_tokenize(tk)
                     else []
                 )
@@ -194,7 +194,7 @@ class FulltextQueryer:
         def toDict(tks):
             d = {}
             if isinstance(tks, str):
-                tks = tks.split(
+                tks = tks.split()
             for t, c in self.tw.weights(tks, preprocess=False):
                 if t not in d:
                     d[t] = 0

rag/nlp/rag_tokenizer.py
CHANGED
@@ -192,7 +192,7 @@ class RagTokenizer:

         # if split chars is part of token
         res = []
-        tks = re.sub(r"[ ]+", " ", tks).split(
+        tks = re.sub(r"[ ]+", " ", tks).split()
         s = 0
         while True:
             if s >= len(tks):
@@ -329,7 +329,7 @@ class RagTokenizer:
         return self.merge_(res)

     def fine_grained_tokenize(self, tks):
-        tks = tks.split(
+        tks = tks.split()
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
         if zh_num < len(tks) * 0.2:
             res = []
@@ -393,7 +393,7 @@ def is_alphabet(s):

 def naiveQie(txt):
     tks = []
-    for t in txt.split(
+    for t in txt.split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]
                             ) and re.match(r".*[a-zA-Z]$", t):
             tks.append(" ")

rag/nlp/search.py
CHANGED
@@ -114,7 +114,7 @@ class Dealer:

         for k in keywords:
             kwds.add(k)
-            for kk in rag_tokenizer.fine_grained_tokenize(k).split(
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -186,7 +186,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))

-        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -195,7 +195,7 @@ class Dealer:
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             rag_tokenizer.tokenize(
-                                                                self.qryr.rmWWW(pieces_[i])).split(
+                                                                self.qryr.rmWWW(pieces_[i])).split(),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
@@ -244,8 +244,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -265,8 +265,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -279,8 +279,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           rag_tokenizer.tokenize(ans).split(
-                                           rag_tokenizer.tokenize(inst).split(
+                                           rag_tokenizer.tokenize(ans).split(),
+                                           rag_tokenizer.tokenize(inst).split())

     def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):

rag/nlp/term_weight.py
CHANGED
@@ -99,7 +99,7 @@ class Dealer:
             txt = re.sub(p, r, txt)

         res = []
-        for t in rag_tokenizer.tokenize(txt).split(
+        for t in rag_tokenizer.tokenize(txt).split():
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
@@ -150,7 +150,7 @@ class Dealer:

     def split(self, txt):
         tks = []
-        for t in re.sub(r"[ \t]+", " ", txt).split(
+        for t in re.sub(r"[ \t]+", " ", txt).split():
             if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                     re.match(r".*[a-zA-Z]$", t) and tks and \
                     self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
@@ -198,7 +198,7 @@ class Dealer:
             s = 0

         if not s and len(t) >= 4:
-            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(
+            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
             if len(s) > 1:
                 s = np.min([freq(tt) for tt in s]) / 6.
             else:
@@ -214,7 +214,7 @@ class Dealer:
         elif re.match(r"[a-z. -]+$", t):
             return 300
         elif len(t) >= 4:
-            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(
+            s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
             if len(s) > 1:
                 return max(3, np.min([df(tt) for tt in s]) / 6.)

rag/utils/es_conn.py
CHANGED
@@ -85,6 +85,9 @@ class ESConnection(DocStoreConnection):
             logging.exception("ESConnection.createIndex error %s" % (indexName))

     def deleteIdx(self, indexName: str, knowledgebaseId: str):
+        if len(knowledgebaseId) > 0:
+            # The index need to be alive after any kb deletion since all kb under this tenant are in one index.
+            return
         try:
             self.es.indices.delete(index=indexName, allow_no_indices=True)
         except NotFoundError:
@@ -400,7 +403,7 @@ class ESConnection(DocStoreConnection):
             if not hlts:
                 continue
             txt = "...".join([a for a in list(hlts.items())[0][1]])
-            if not is_english(txt.split(
+            if not is_english(txt.split()):
                 ans[d["_id"]] = txt
                 continue

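The new guard in `deleteIdx` reflects the added comment: with the Elasticsearch backend, every knowledge base of a tenant lives in one shared index, so removing a single knowledge base must not drop the physical index. A simplified sketch of that logic (standalone function, hypothetical `es` client, error handling trimmed):

```python
def delete_idx(es, index_name: str, knowledgebase_id: str) -> None:
    if len(knowledgebase_id) > 0:
        # Deleting one knowledge base must keep the shared per-tenant index alive;
        # its documents were already removed by the delete-by-kb_id call in api/apps/kb_app.py.
        return
    # Only a tenant-wide cleanup (empty knowledgebase_id) removes the physical index.
    es.indices.delete(index=index_name, allow_no_indices=True)
```
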
rag/utils/infinity_conn.py
CHANGED
@@ -419,7 +419,7 @@ class InfinityConnection(DocStoreConnection):
             v = list(v)
         elif fieldnm == "important_kwd":
             assert isinstance(v, str)
-            v = v.split(
+            v = v.split()
         else:
             if not isinstance(v, str):
                 v = str(v)