zhichyu committed on
Commit 1b2aab6 · 1 Parent(s): 52b2996

Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Editing a chunk should update the existing record in the document store instead of inserting a new one. Closes #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/apps/chunk_app.py CHANGED
@@ -155,7 +155,7 @@ def set():
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
-        settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
+        settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)
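The core of the fix: the chunk editor rebuilds the chunk fields in `d` and previously wrote them back with `docStoreConn.insert`, which can leave a stale copy of the chunk in the document store alongside the edited one. Keying the write on `{"id": req["chunk_id"]}` via `update` overwrites the existing record instead. A minimal sketch of the two call shapes, using a toy in-memory store (`ToyDocStore` is illustrative, not RAGFlow's actual connector):

```python
# Illustrative only: a toy store mimicking the insert/update call shapes used in set() above.
class ToyDocStore:
    def __init__(self):
        self.rows = []  # each row is a dict of chunk fields

    def insert(self, documents, index_name, kb_id):
        # Appends blindly; calling this for an edited chunk leaves the stale copy behind.
        for d in documents:
            self.rows.append({**d, "_index": index_name, "kb_id": kb_id})

    def update(self, condition, new_value, index_name, kb_id):
        # Overwrites fields of rows matching the condition instead of adding new rows.
        for row in self.rows:
            if all(row.get(k) == v for k, v in condition.items()):
                row.update(new_value)


store = ToyDocStore()
store.insert([{"id": "c1", "content_with_weight": "original"}], "ragflow_t1", "kb1")

# Editing via insert would leave two rows carrying the same chunk id:
# store.insert([{"id": "c1", "content_with_weight": "edited"}], "ragflow_t1", "kb1")

# Editing via update keeps a single row and just refreshes its fields:
store.update({"id": "c1"}, {"content_with_weight": "edited"}, "ragflow_t1", "kb1")
assert len(store.rows) == 1 and store.rows[0]["content_with_weight"] == "edited"
```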
api/apps/kb_app.py CHANGED
@@ -168,7 +168,9 @@ def rm():
         if not KnowledgebaseService.delete_by_id(req["kb_id"]):
             return get_data_error_result(
                 message="Database error (Knowledgebase removal)!")
-        settings.docStoreConn.delete({"kb_id": req["kb_id"]}, search.index_name(kbs[0].tenant_id), req["kb_id"])
+        for kb in kbs:
+            settings.docStoreConn.delete({"kb_id": kb.id}, search.index_name(kb.tenant_id), kb.id)
+            settings.docStoreConn.deleteIdx(search.index_name(kb.tenant_id), kb.id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)
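The removal path now iterates over the matched knowledge bases, deleting each one's chunks from its own tenant's index and then asking the connector to drop any per-kb storage via `deleteIdx`. What `deleteIdx` actually does is backend-specific: the rag/utils/es_conn.py hunk near the end of this diff turns it into a no-op whenever a kb id is passed, because every knowledge base of a tenant lives in the same Elasticsearch index. A sketch of that guard from the caller's side, with a stubbed client (the stub is illustrative; only `indices.delete(index=..., allow_no_indices=True)` mirrors the real call):

```python
class _FakeIndices:
    def __init__(self):
        self.deleted = []

    def delete(self, index, allow_no_indices=True):
        self.deleted.append(index)


class _FakeES:
    def __init__(self):
        self.indices = _FakeIndices()


class MiniESConnection:
    """Sketch of the new guard only; the real connector also handles NotFoundError and logging."""

    def __init__(self, es):
        self.es = es

    def deleteIdx(self, indexName: str, knowledgebaseId: str):
        if len(knowledgebaseId) > 0:
            # All kbs of a tenant share one index, so removing a single kb must not drop it.
            return
        self.es.indices.delete(index=indexName, allow_no_indices=True)


es = _FakeES()
conn = MiniESConnection(es)
conn.deleteIdx("ragflow_tenant1", "kb1")   # no-op: other kbs of tenant1 still use this index
conn.deleteIdx("ragflow_tenant1", "")      # no kb id given: the whole index really is dropped
assert es.indices.deleted == ["ragflow_tenant1"]
```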
api/apps/user_app.py CHANGED
@@ -252,7 +252,7 @@ def feishu_callback():
     if res["code"] != 0:
         return redirect("/?error=%s" % res["message"])
 
-    if "contact:user.email:readonly" not in res["data"]["scope"].split(" "):
+    if "contact:user.email:readonly" not in res["data"]["scope"].split():
         return redirect("/?error=contact:user.email:readonly not in scope")
     session["access_token"] = res["data"]["access_token"]
     session["access_token_from"] = "feishu"
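This is the first of many `split(" ")` → `split()` replacements in this commit. The difference matters whenever the input can contain consecutive, leading, or trailing whitespace: splitting on a literal space produces empty strings for every extra space, which then pollute token lists, word counts, and membership checks like the scope test above. Calling `str.split()` with no argument splits on any run of whitespace and never yields empty tokens. A quick demonstration (the scope string is an illustrative value, not a real Feishu response):

```python
scope = "contact:user.email:readonly  another:scope"  # note the double space

print(scope.split(" "))  # ['contact:user.email:readonly', '', 'another:scope']
print(scope.split())     # ['contact:user.email:readonly', 'another:scope']

# The same difference skews the length-based heuristics touched later in this diff:
txt = "a  b   c"
print(len(txt.split(" ")), len(txt.split()))  # 6 3
```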
deepdoc/parser/docx_parser.py CHANGED
@@ -47,7 +47,7 @@ class RAGFlowDocxParser:
             for p, n in patt:
                 if re.search(p, b):
                     return n
-            tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
+            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
             if len(tks) > 3:
                 if len(tks) < 12:
                     return "Tx"
deepdoc/parser/pdf_parser.py CHANGED
@@ -108,13 +108,13 @@ class RAGFlowPdfParser:
         h = max(self.__height(up), self.__height(down))
         y_dis = self._y_dis(up, down)
         LEN = 6
-        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
-        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
+        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
+        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
         tks_all = up["text"][-LEN:].strip() \
             + (" " if re.match(r"[a-zA-Z0-9]+",
                                up["text"][-1] + down["text"][0]) else "") \
             + down["text"][:LEN].strip()
-        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
+        tks_all = rag_tokenizer.tokenize(tks_all).split()
         fea = [
             up.get("R", -1) == down.get("R", -1),
             y_dis / h,
@@ -565,13 +565,13 @@ class RAGFlowPdfParser:
             if i >= len(self.boxes):
                 break
             prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
-                self.boxes[i]["text"].strip().split(" ")[:2])
+                self.boxes[i]["text"].strip().split()[:2])
             while not prefix:
                 self.boxes.pop(i)
                 if i >= len(self.boxes):
                     break
                 prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                     self.boxes[i]["text"].strip().split()[:2])
             self.boxes.pop(i)
             if i >= len(self.boxes) or not prefix:
                 break
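For context on the first pdf_parser.py hunk: `tks_all` glues the tail of the upper text box to the head of the lower one, inserting a space only when the characters meeting at the junction are both ASCII alphanumerics, presumably so Latin text is not fused across the line break while CJK text needs no separator. A standalone rendering of that boundary rule (the tokenizer call is left out since only the string gluing is being illustrated):

```python
import re

LEN = 6  # same window as the parser: last/first 6 characters of each box

def glue(up_text: str, down_text: str) -> str:
    # Insert a space only when an ASCII letter/digit meets another one across the line break.
    sep = " " if re.match(r"[a-zA-Z0-9]+", up_text[-1] + down_text[0]) else ""
    return up_text[-LEN:].strip() + sep + down_text[:LEN].strip()

print(glue("model tra", "ining data"))  # 'el tra ining'  -> a space is inserted at the Latin junction
print(glue("语言模型训练", "数据集构建"))    # '语言模型训练数据集构建' -> no space at the CJK junction
```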
deepdoc/parser/resume/entities/corporations.py CHANGED
@@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True):
     nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
     if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
 
-    tks = rag_tokenizer.tokenize(nm).split(" ")
+    tks = rag_tokenizer.tokenize(nm).split()
     reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:
deepdoc/parser/resume/entities/schools.py CHANGED
@@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
 
 def split(txt):
     tks = []
-    for t in re.sub(r"[ \t]+", " ",txt).split(" "):
+    for t in re.sub(r"[ \t]+", " ",txt).split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                 re.match(r"[a-zA-Z]", t) and tks:
             tks[-1] = tks[-1] + " " + t
deepdoc/parser/resume/step_one.py CHANGED
@@ -80,7 +80,7 @@ def refactor(df):
     def loadjson(line):
         try:
             return json.loads(line)
-        except Exception as e:
+        except Exception:
             pass
         return {}
 
@@ -183,4 +183,4 @@ def refactor(df):
                                                           "\r",
                                                           "\\n"))
     # print(df.values.tolist())
-    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
+    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
deepdoc/parser/resume/step_two.py CHANGED
@@ -100,7 +100,7 @@ def forEdu(cv):
         if n.get("school_name") and isinstance(n["school_name"], str):
             sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
             e["sch_nm_kwd"] = sch[-1]
-            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
+            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
 
         if n.get("discipline_name") and isinstance(n["discipline_name"], str):
             maj.append(n["discipline_name"])
@@ -485,7 +485,7 @@ def parse(cv):
         nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
         nm = re.sub(r"[ \t ]+", " ", nm)
         if re.match(r"[a-zA-Z ]+$", nm):
-            if len(nm.split(" ")) > 1:
+            if len(nm.split()) > 1:
                 cv["name"] = nm
             else:
                 nm = ""
@@ -503,7 +503,7 @@ def parse(cv):
             for py in PY.get_pinyins(nm[:20], ''):
                 for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
             for py in PY.get_pinyins(nm[:20], ' '):
-                py = py.split(" ")
+                py = py.split()
                 for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
 
         cv["name_kwd"] = name
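The last step_two.py hunk builds pinyin prefix tokens for name search, and the `py.split()` fix keeps an empty syllable from sneaking into those prefixes when the pinyin string happens to contain a double space. A worked example with the pinyin hardcoded (so no specific pinyin library is assumed; the real code obtains it from `PY.get_pinyins(nm[:20], ' ')`):

```python
name_py_pref0_tks = ""
for py in ["zhang  san"]:       # illustrative pinyin with an accidental double space
    syllables = py.split()      # ['zhang', 'san']; split(" ") would also yield an empty token
    for i in range(1, len(syllables) + 1):
        name_py_pref0_tks += " " + "".join(syllables[:i])

print(name_py_pref0_tks)        # ' zhang zhangsan'
# With split(" ") the empty token would have produced a duplicate: ' zhang zhang zhangsan'.
```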
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -117,7 +117,7 @@ class TableStructureRecognizer(Recognizer):
             for p, n in patt:
                 if re.search(p, b["text"].strip()):
                     return n
-            tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
+            tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
             if len(tks) > 3:
                 if len(tks) < 12:
                     return "Tx"
rag/app/paper.py CHANGED
@@ -99,11 +99,11 @@ class Pdf(PdfParser):
                 i += 1
                 txt = b["text"].lower().strip()
                 if re.match("(abstract|摘要)", txt):
-                    if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    if len(txt.split()) > 32 or len(txt) > 64:
                         abstr = txt + self._line_tag(b, zoomin)
                         break
                     txt = self.boxes[i]["text"].lower().strip()
-                    if len(txt.split(" ")) > 32 or len(txt) > 64:
+                    if len(txt.split()) > 32 or len(txt) > 64:
                         abstr = txt + self._line_tag(self.boxes[i], zoomin)
                         i += 1
                         break
rag/app/picture.py CHANGED
@@ -33,7 +33,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
     eng = lang.lower() == "english"
     callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
-    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+    if (eng and len(txt.split()) > 32) or len(txt) > 32:
         tokenize(doc, txt, eng)
         callback(0.8, "OCR results is too long to use CV LLM.")
         return [doc]
rag/nlp/__init__.py CHANGED
@@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False):
         sections.pop(i)
         if i >= len(sections):
             break
-        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         while not prefix:
             sections.pop(i)
             if i >= len(sections):
                 break
-            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         sections.pop(i)
         if i >= len(sections) or not prefix:
             break
@@ -389,7 +389,7 @@ def title_frequency(bull, sections):
     def not_title(txt):
         if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
             return False
-        if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
+        if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
             return True
         return re.search(r"[,;,。;!!]", txt)
 
rag/nlp/query.py CHANGED
@@ -74,7 +74,7 @@ class FulltextQueryer:
 
         if not self.isChinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
-            tks = rag_tokenizer.tokenize(txt).split(" ")
+            tks = rag_tokenizer.tokenize(txt).split()
             keywords = [t for t in tks if t]
             tks_w = self.tw.weights(tks, preprocess=False)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@@ -83,7 +83,7 @@ class FulltextQueryer:
             syns = []
             for tk, w in tks_w:
                 syn = self.syn.lookup(tk)
-                syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
+                syn = rag_tokenizer.tokenize(" ".join(syn)).split()
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
                 syns.append(" ".join(syn))
@@ -114,7 +114,7 @@ class FulltextQueryer:
 
         txt = FulltextQueryer.rmWWW(txt)
         qs, keywords = [], []
-        for tt in self.tw.split(txt)[:256]:  # .split(" "):
+        for tt in self.tw.split(txt)[:256]:  # .split():
            if not tt:
                continue
            keywords.append(tt)
@@ -125,7 +125,7 @@ class FulltextQueryer:
            tms = []
            for tk, w in sorted(twts, key=lambda x: x[1] * -1):
                sm = (
-                    rag_tokenizer.fine_grained_tokenize(tk).split(" ")
+                    rag_tokenizer.fine_grained_tokenize(tk).split()
                    if need_fine_grained_tokenize(tk)
                    else []
                )
@@ -194,7 +194,7 @@ class FulltextQueryer:
        def toDict(tks):
            d = {}
            if isinstance(tks, str):
-                tks = tks.split(" ")
+                tks = tks.split()
            for t, c in self.tw.weights(tks, preprocess=False):
                if t not in d:
                    d[t] = 0
rag/nlp/rag_tokenizer.py CHANGED
@@ -192,7 +192,7 @@ class RagTokenizer:
 
         # if split chars is part of token
         res = []
-        tks = re.sub(r"[ ]+", " ", tks).split(" ")
+        tks = re.sub(r"[ ]+", " ", tks).split()
         s = 0
         while True:
             if s >= len(tks):
@@ -329,7 +329,7 @@ class RagTokenizer:
         return self.merge_(res)
 
     def fine_grained_tokenize(self, tks):
-        tks = tks.split(" ")
+        tks = tks.split()
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
         if zh_num < len(tks) * 0.2:
             res = []
@@ -393,7 +393,7 @@ def is_alphabet(s):
 
 def naiveQie(txt):
     tks = []
-    for t in txt.split(" "):
+    for t in txt.split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]
                             ) and re.match(r".*[a-zA-Z]$", t):
             tks.append(" ")
rag/nlp/search.py CHANGED
@@ -114,7 +114,7 @@ class Dealer:
 
         for k in keywords:
             kwds.add(k)
-            for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -186,7 +186,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -195,7 +195,7 @@ class Dealer:
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             rag_tokenizer.tokenize(
-                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
@@ -244,8 +244,8 @@ class Dealer:
             sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(" ")
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -265,8 +265,8 @@ class Dealer:
             sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(" ")
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
@@ -279,8 +279,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           rag_tokenizer.tokenize(ans).split(" "),
-                                           rag_tokenizer.tokenize(inst).split(" "))
+                                           rag_tokenizer.tokenize(ans).split(),
+                                           rag_tokenizer.tokenize(inst).split())
 
     def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):
rag/nlp/term_weight.py CHANGED
@@ -99,7 +99,7 @@ class Dealer:
             txt = re.sub(p, r, txt)
 
         res = []
-        for t in rag_tokenizer.tokenize(txt).split(" "):
+        for t in rag_tokenizer.tokenize(txt).split():
            tk = t
            if (stpwd and tk in self.stop_words) or (
                    re.match(r"[0-9]$", tk) and not num):
@@ -150,7 +150,7 @@ class Dealer:
 
    def split(self, txt):
        tks = []
-        for t in re.sub(r"[ \t]+", " ", txt).split(" "):
+        for t in re.sub(r"[ \t]+", " ", txt).split():
            if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                    re.match(r".*[a-zA-Z]$", t) and tks and \
                    self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
@@ -198,7 +198,7 @@ class Dealer:
                s = 0
 
            if not s and len(t) >= 4:
-                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
                if len(s) > 1:
                    s = np.min([freq(tt) for tt in s]) / 6.
                else:
@@ -214,7 +214,7 @@ class Dealer:
            elif re.match(r"[a-z. -]+$", t):
                return 300
            elif len(t) >= 4:
-                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
                if len(s) > 1:
                    return max(3, np.min([df(tt) for tt in s]) / 6.)
 
rag/utils/es_conn.py CHANGED
@@ -85,6 +85,9 @@ class ESConnection(DocStoreConnection):
             logging.exception("ESConnection.createIndex error %s" % (indexName))
 
     def deleteIdx(self, indexName: str, knowledgebaseId: str):
+        if len(knowledgebaseId) > 0:
+            # The index need to be alive after any kb deletion since all kb under this tenant are in one index.
+            return
         try:
             self.es.indices.delete(index=indexName, allow_no_indices=True)
         except NotFoundError:
@@ -400,7 +403,7 @@ class ESConnection(DocStoreConnection):
             if not hlts:
                 continue
             txt = "...".join([a for a in list(hlts.items())[0][1]])
-            if not is_english(txt.split(" ")):
+            if not is_english(txt.split()):
                 ans[d["_id"]] = txt
                 continue
 
rag/utils/infinity_conn.py CHANGED
@@ -419,7 +419,7 @@ class InfinityConnection(DocStoreConnection):
             v = list(v)
         elif fieldnm == "important_kwd":
             assert isinstance(v, str)
-            v = v.split(" ")
+            v = v.split()
         else:
             if not isinstance(v, str):
                 v = str(v)