KevinHuSh committed
Commit cfd6ece · Parent: dcce454

refine code (#595)


### What problem does this PR solve?

Renames the Chinese tokenizer module `rag.nlp.huqie` to `rag.nlp.rag_tokenizer` and gives its methods descriptive names (`qie` → `tokenize`, `qieqie` → `fine_grained_tokenize`, `needQieqie` → `need_fine_grained_tokenize`), updating all call sites under `api/`, `deepdoc/` and `rag/`. It also makes the `File` join in `TaskService.get_ongoing_doc_name` a LEFT OUTER join, drops the `progress >= 0` filter and widens the recent-task window (`180000` → `1000 * 600`), routes the cache log in `rag/svr/cache_file_svr.py` through `cron_logger`, and removes an unused `FileService` import from `rag/svr/task_broker.py`.

### Type of change

- [x] Refactoring
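
For reviewers unfamiliar with the old `huqie` helpers, here is a minimal usage sketch of the renamed API, using only the module-level aliases exported at the bottom of the new `rag/nlp/rag_tokenizer.py`. It assumes a RAGFlow checkout with the bundled `rag/res/huqie` dictionary available; the sample string is illustrative only.

```python
from rag.nlp import rag_tokenizer

text = "多校划片就是一个小区对应多个小学初中 Scripts are compiled and cached"

# Coarse segmentation: full-width and traditional characters are normalized,
# then forward/backward maximum matching over the trie is scored and the
# better split is kept. The result is a space-joined string of tokens.
tks = rag_tokenizer.tokenize(text)

# Fine-grained pass over the coarse tokens: longer Chinese tokens are
# re-segmented for higher recall; mostly-English input passes through.
sm_tks = rag_tokenizer.fine_grained_tokenize(tks)

print(tks)
print(sm_tks)
print(rag_tokenizer.tag(tks.split(" ")[0]))  # trie POS tag, "" when unknown
```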

api/apps/chunk_app.py CHANGED
@@ -20,7 +20,7 @@ from flask_login import login_required, current_user
20
  from elasticsearch_dsl import Q
21
 
22
  from rag.app.qa import rmPrefix, beAdoc
23
- from rag.nlp import search, huqie
24
  from rag.utils.es_conn import ELASTICSEARCH
25
  from rag.utils import rmSpace
26
  from api.db import LLMType, ParserType
@@ -125,10 +125,10 @@ def set():
125
  d = {
126
  "id": req["chunk_id"],
127
  "content_with_weight": req["content_with_weight"]}
128
- d["content_ltks"] = huqie.qie(req["content_with_weight"])
129
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
130
  d["important_kwd"] = req["important_kwd"]
131
- d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
132
  if "available_int" in req:
133
  d["available_int"] = req["available_int"]
134
 
@@ -152,7 +152,7 @@ def set():
152
  retmsg="Q&A must be separated by TAB/ENTER key.")
153
  q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
154
  d = beAdoc(d, arr[0], arr[1], not any(
155
- [huqie.is_chinese(t) for t in q + a]))
156
 
157
  v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
158
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -202,11 +202,11 @@ def create():
202
  md5 = hashlib.md5()
203
  md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
204
  chunck_id = md5.hexdigest()
205
- d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
206
  "content_with_weight": req["content_with_weight"]}
207
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
208
  d["important_kwd"] = req.get("important_kwd", [])
209
- d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
210
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
211
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
212
 
 
20
  from elasticsearch_dsl import Q
21
 
22
  from rag.app.qa import rmPrefix, beAdoc
23
+ from rag.nlp import search, rag_tokenizer
24
  from rag.utils.es_conn import ELASTICSEARCH
25
  from rag.utils import rmSpace
26
  from api.db import LLMType, ParserType
 
125
  d = {
126
  "id": req["chunk_id"],
127
  "content_with_weight": req["content_with_weight"]}
128
+ d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
129
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
130
  d["important_kwd"] = req["important_kwd"]
131
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
132
  if "available_int" in req:
133
  d["available_int"] = req["available_int"]
134
 
 
152
  retmsg="Q&A must be separated by TAB/ENTER key.")
153
  q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
154
  d = beAdoc(d, arr[0], arr[1], not any(
155
+ [rag_tokenizer.is_chinese(t) for t in q + a]))
156
 
157
  v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
158
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
 
202
  md5 = hashlib.md5()
203
  md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
204
  chunck_id = md5.hexdigest()
205
+ d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
206
  "content_with_weight": req["content_with_weight"]}
207
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
208
  d["important_kwd"] = req.get("important_kwd", [])
209
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
210
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
211
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
212
 
api/db/services/task_service.py CHANGED
@@ -78,14 +78,13 @@ class TaskService(CommonService):
78
  docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
79
  .join(Document, on=(cls.model.doc_id == Document.id)) \
80
  .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
81
- .join(File, on=(File2Document.file_id == File.id)) \
82
  .where(
83
  Document.status == StatusEnum.VALID.value,
84
  Document.run == TaskStatus.RUNNING.value,
85
  ~(Document.type == FileType.VIRTUAL.value),
86
- cls.model.progress >= 0,
87
  cls.model.progress < 1,
88
- cls.model.create_time >= current_timestamp() - 180000
89
  )
90
  docs = list(docs.dicts())
91
  if not docs: return []
 
78
  docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
79
  .join(Document, on=(cls.model.doc_id == Document.id)) \
80
  .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
81
+ .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
82
  .where(
83
  Document.status == StatusEnum.VALID.value,
84
  Document.run == TaskStatus.RUNNING.value,
85
  ~(Document.type == FileType.VIRTUAL.value),
 
86
  cls.model.progress < 1,
87
+ cls.model.create_time >= current_timestamp() - 1000 * 600
88
  )
89
  docs = list(docs.dicts())
90
  if not docs: return []
deepdoc/parser/docx_parser.py CHANGED
@@ -3,7 +3,7 @@ from docx import Document
3
  import re
4
  import pandas as pd
5
  from collections import Counter
6
- from rag.nlp import huqie
7
  from io import BytesIO
8
 
9
 
@@ -35,14 +35,14 @@ class RAGFlowDocxParser:
35
  for p, n in patt:
36
  if re.search(p, b):
37
  return n
38
- tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
39
  if len(tks) > 3:
40
  if len(tks) < 12:
41
  return "Tx"
42
  else:
43
  return "Lx"
44
 
45
- if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
46
  return "Nr"
47
 
48
  return "Ot"
 
3
  import re
4
  import pandas as pd
5
  from collections import Counter
6
+ from rag.nlp import rag_tokenizer
7
  from io import BytesIO
8
 
9
 
 
35
  for p, n in patt:
36
  if re.search(p, b):
37
  return n
38
+ tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
39
  if len(tks) > 3:
40
  if len(tks) < 12:
41
  return "Tx"
42
  else:
43
  return "Lx"
44
 
45
+ if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
46
  return "Nr"
47
 
48
  return "Ot"
deepdoc/parser/pdf_parser.py CHANGED
@@ -16,7 +16,7 @@ from PyPDF2 import PdfReader as pdf2_read
16
 
17
  from api.utils.file_utils import get_project_base_directory
18
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
19
- from rag.nlp import huqie
20
  from copy import deepcopy
21
  from huggingface_hub import snapshot_download
22
 
@@ -95,13 +95,13 @@ class RAGFlowPdfParser:
95
  h = max(self.__height(up), self.__height(down))
96
  y_dis = self._y_dis(up, down)
97
  LEN = 6
98
- tks_down = huqie.qie(down["text"][:LEN]).split(" ")
99
- tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
100
  tks_all = up["text"][-LEN:].strip() \
101
  + (" " if re.match(r"[a-zA-Z0-9]+",
102
  up["text"][-1] + down["text"][0]) else "") \
103
  + down["text"][:LEN].strip()
104
- tks_all = huqie.qie(tks_all).split(" ")
105
  fea = [
106
  up.get("R", -1) == down.get("R", -1),
107
  y_dis / h,
@@ -142,8 +142,8 @@ class RAGFlowPdfParser:
142
  tks_down[-1] == tks_up[-1],
143
  max(down["in_row"], up["in_row"]),
144
  abs(down["in_row"] - up["in_row"]),
145
- len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
146
- len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
147
  ]
148
  return fea
149
 
@@ -599,7 +599,7 @@ class RAGFlowPdfParser:
599
 
600
  if b["text"].strip()[0] != b_["text"].strip()[0] \
601
  or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
602
- or huqie.is_chinese(b["text"].strip()[0]) \
603
  or b["top"] > b_["bottom"]:
604
  i += 1
605
  continue
 
16
 
17
  from api.utils.file_utils import get_project_base_directory
18
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
19
+ from rag.nlp import rag_tokenizer
20
  from copy import deepcopy
21
  from huggingface_hub import snapshot_download
22
 
 
95
  h = max(self.__height(up), self.__height(down))
96
  y_dis = self._y_dis(up, down)
97
  LEN = 6
98
+ tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
99
+ tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
100
  tks_all = up["text"][-LEN:].strip() \
101
  + (" " if re.match(r"[a-zA-Z0-9]+",
102
  up["text"][-1] + down["text"][0]) else "") \
103
  + down["text"][:LEN].strip()
104
+ tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
105
  fea = [
106
  up.get("R", -1) == down.get("R", -1),
107
  y_dis / h,
 
142
  tks_down[-1] == tks_up[-1],
143
  max(down["in_row"], up["in_row"]),
144
  abs(down["in_row"] - up["in_row"]),
145
+ len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
146
+ len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
147
  ]
148
  return fea
149
 
 
599
 
600
  if b["text"].strip()[0] != b_["text"].strip()[0] \
601
  or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
602
+ or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
603
  or b["top"] > b_["bottom"]:
604
  i += 1
605
  continue
deepdoc/parser/resume/entities/corporations.py CHANGED
@@ -1,6 +1,6 @@
1
  import re,json,os
2
  import pandas as pd
3
- from rag.nlp import huqie
4
  from . import regions
5
  current_file_path = os.path.dirname(os.path.abspath(__file__))
6
  GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
@@ -22,14 +22,14 @@ def baike(cid, default_v=0):
22
  def corpNorm(nm, add_region=True):
23
  global CORP_TKS
24
  if not nm or type(nm)!=type(""):return ""
25
- nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
26
  nm = re.sub(r"&amp;", "&", nm)
27
  nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
28
  nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
29
  nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
30
  if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
31
 
32
- tks = huqie.qie(nm).split(" ")
33
  reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
34
  nm = ""
35
  for t in tks:
 
1
  import re,json,os
2
  import pandas as pd
3
+ from rag.nlp import rag_tokenizer
4
  from . import regions
5
  current_file_path = os.path.dirname(os.path.abspath(__file__))
6
  GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
 
22
  def corpNorm(nm, add_region=True):
23
  global CORP_TKS
24
  if not nm or type(nm)!=type(""):return ""
25
+ nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
26
  nm = re.sub(r"&amp;", "&", nm)
27
  nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
28
  nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
29
  nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
30
  if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
31
 
32
+ tks = rag_tokenizer.tokenize(nm).split(" ")
33
  reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
34
  nm = ""
35
  for t in tks:
deepdoc/parser/resume/step_two.py CHANGED
@@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
3
  traceback, signal
4
  import numpy as np
5
  from deepdoc.parser.resume.entities import degrees, schools, corporations
6
- from rag.nlp import huqie, surname
7
  from xpinyin import Pinyin
8
  from contextlib import contextmanager
9
 
@@ -83,7 +83,7 @@ def forEdu(cv):
83
  if n.get("school_name") and isinstance(n["school_name"], str):
84
  sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
85
  e["sch_nm_kwd"] = sch[-1]
86
- fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
87
 
88
  if n.get("discipline_name") and isinstance(n["discipline_name"], str):
89
  maj.append(n["discipline_name"])
@@ -166,10 +166,10 @@ def forEdu(cv):
166
  if "tag_kwd" not in cv: cv["tag_kwd"] = []
167
  if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
168
 
169
- if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
170
- if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
171
- if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
172
- if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
173
 
174
  return cv
175
 
@@ -187,11 +187,11 @@ def forProj(cv):
187
  if n.get("achivement"): desc.append(str(n["achivement"]))
188
 
189
  if pro_nms:
190
- # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
191
- cv["project_name_tks"] = huqie.qie(pro_nms[0])
192
  if desc:
193
- cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
194
- cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
195
 
196
  return cv
197
 
@@ -280,25 +280,25 @@ def forWork(cv):
280
  if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
281
 
282
  if fea["position_name"]:
283
- cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
284
- cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
285
- cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
286
 
287
  if fea["industry_name"]:
288
- cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
289
- cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
290
- cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
291
 
292
  if fea["corporation_name"]:
293
  cv["corporation_name_kwd"] = fea["corporation_name"][0]
294
  cv["corp_nm_kwd"] = fea["corporation_name"]
295
- cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
296
- cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
297
- cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
298
 
299
  if fea["responsibilities"]:
300
- cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
301
- cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
302
 
303
  if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
304
  re.match(r"[^0-9]+$", str(i))]
@@ -444,15 +444,15 @@ def parse(cv):
444
  if nms:
445
  t = k[:-4]
446
  cv[f"{t}_kwd"] = nms
447
- cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
448
  except Exception as e:
449
  print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
450
  cv[k] = []
451
 
452
  # tokenize fields
453
  if k in tks_fld:
454
- cv[f"{k}_tks"] = huqie.qie(cv[k])
455
- if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
456
 
457
  # keyword fields
458
  if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
@@ -492,7 +492,7 @@ def parse(cv):
492
  cv["name_kwd"] = name
493
  cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
494
  cv["name_tks"] = (
495
- huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
496
  ) if name else ""
497
  else:
498
  cv["integerity_flt"] /= 2.
@@ -515,7 +515,7 @@ def parse(cv):
515
  cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
516
  # long text tokenize
517
 
518
- if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
519
 
520
  # for yes or no field
521
  fea = []
 
3
  traceback, signal
4
  import numpy as np
5
  from deepdoc.parser.resume.entities import degrees, schools, corporations
6
+ from rag.nlp import rag_tokenizer, surname
7
  from xpinyin import Pinyin
8
  from contextlib import contextmanager
9
 
 
83
  if n.get("school_name") and isinstance(n["school_name"], str):
84
  sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
85
  e["sch_nm_kwd"] = sch[-1]
86
+ fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
87
 
88
  if n.get("discipline_name") and isinstance(n["discipline_name"], str):
89
  maj.append(n["discipline_name"])
 
166
  if "tag_kwd" not in cv: cv["tag_kwd"] = []
167
  if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
168
 
169
+ if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
170
+ if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
171
+ if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
172
+ if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
173
 
174
  return cv
175
 
 
187
  if n.get("achivement"): desc.append(str(n["achivement"]))
188
 
189
  if pro_nms:
190
+ # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
191
+ cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
192
  if desc:
193
+ cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
194
+ cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
195
 
196
  return cv
197
 
 
280
  if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
281
 
282
  if fea["position_name"]:
283
+ cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
284
+ cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
285
+ cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
286
 
287
  if fea["industry_name"]:
288
+ cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
289
+ cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
290
+ cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
291
 
292
  if fea["corporation_name"]:
293
  cv["corporation_name_kwd"] = fea["corporation_name"][0]
294
  cv["corp_nm_kwd"] = fea["corporation_name"]
295
+ cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
296
+ cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
297
+ cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
298
 
299
  if fea["responsibilities"]:
300
+ cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
301
+ cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
302
 
303
  if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
304
  re.match(r"[^0-9]+$", str(i))]
 
444
  if nms:
445
  t = k[:-4]
446
  cv[f"{t}_kwd"] = nms
447
+ cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
448
  except Exception as e:
449
  print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
450
  cv[k] = []
451
 
452
  # tokenize fields
453
  if k in tks_fld:
454
+ cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
455
+ if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
456
 
457
  # keyword fields
458
  if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
 
492
  cv["name_kwd"] = name
493
  cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
494
  cv["name_tks"] = (
495
+ rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
496
  ) if name else ""
497
  else:
498
  cv["integerity_flt"] /= 2.
 
515
  cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
516
  # long text tokenize
517
 
518
+ if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
519
 
520
  # for yes or no field
521
  fea = []
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -19,7 +19,7 @@ import numpy as np
19
  from huggingface_hub import snapshot_download
20
 
21
  from api.utils.file_utils import get_project_base_directory
22
- from rag.nlp import huqie
23
  from .recognizer import Recognizer
24
 
25
 
@@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
117
  for p, n in patt:
118
  if re.search(p, b["text"].strip()):
119
  return n
120
- tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
121
  if len(tks) > 3:
122
  if len(tks) < 12:
123
  return "Tx"
124
  else:
125
  return "Lx"
126
 
127
- if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
128
  return "Nr"
129
 
130
  return "Ot"
 
19
  from huggingface_hub import snapshot_download
20
 
21
  from api.utils.file_utils import get_project_base_directory
22
+ from rag.nlp import rag_tokenizer
23
  from .recognizer import Recognizer
24
 
25
 
 
117
  for p, n in patt:
118
  if re.search(p, b["text"].strip()):
119
  return n
120
+ tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
121
  if len(tks) > 3:
122
  if len(tks) < 12:
123
  return "Tx"
124
  else:
125
  return "Lx"
126
 
127
+ if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
128
  return "Nr"
129
 
130
  return "Ot"
rag/app/book.py CHANGED
@@ -18,7 +18,7 @@ from io import BytesIO
18
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
19
  hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
20
  tokenize_chunks, find_codec
21
- from rag.nlp import huqie
22
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
23
 
24
 
@@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
63
  """
64
  doc = {
65
  "docnm_kwd": filename,
66
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
67
  }
68
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
69
  pdf_parser = None
70
  sections, tbls = [], []
71
  if re.search(r"\.docx$", filename, re.IGNORECASE):
 
18
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
19
  hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
20
  tokenize_chunks, find_codec
21
+ from rag.nlp import rag_tokenizer
22
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
23
 
24
 
 
63
  """
64
  doc = {
65
  "docnm_kwd": filename,
66
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
67
  }
68
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
69
  pdf_parser = None
70
  sections, tbls = [], []
71
  if re.search(r"\.docx$", filename, re.IGNORECASE):
rag/app/laws.py CHANGED
@@ -19,7 +19,7 @@ from docx import Document
19
  from api.db import ParserType
20
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
21
  make_colon_as_title, add_positions, tokenize_chunks, find_codec
22
- from rag.nlp import huqie
23
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
24
  from rag.settings import cron_logger
25
 
@@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
89
  """
90
  doc = {
91
  "docnm_kwd": filename,
92
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
93
  }
94
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
95
  pdf_parser = None
96
  sections = []
97
  if re.search(r"\.docx$", filename, re.IGNORECASE):
 
19
  from api.db import ParserType
20
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
21
  make_colon_as_title, add_positions, tokenize_chunks, find_codec
22
+ from rag.nlp import rag_tokenizer
23
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
24
  from rag.settings import cron_logger
25
 
 
89
  """
90
  doc = {
91
  "docnm_kwd": filename,
92
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
93
  }
94
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
95
  pdf_parser = None
96
  sections = []
97
  if re.search(r"\.docx$", filename, re.IGNORECASE):
rag/app/manual.py CHANGED
@@ -2,7 +2,7 @@ import copy
2
  import re
3
 
4
  from api.db import ParserType
5
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
6
  from deepdoc.parser import PdfParser, PlainParser
7
  from rag.utils import num_tokens_from_string
8
 
@@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
70
  doc = {
71
  "docnm_kwd": filename
72
  }
73
- doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
74
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
75
  # is it English
76
  eng = lang.lower() == "english" # pdf_parser.is_english
77
 
 
2
  import re
3
 
4
  from api.db import ParserType
5
+ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
6
  from deepdoc.parser import PdfParser, PlainParser
7
  from rag.utils import num_tokens_from_string
8
 
 
70
  doc = {
71
  "docnm_kwd": filename
72
  }
73
+ doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
74
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
75
  # is it English
76
  eng = lang.lower() == "english" # pdf_parser.is_english
77
 
rag/app/naive.py CHANGED
@@ -16,7 +16,7 @@ from docx import Document
16
  from timeit import default_timer as timer
17
  import re
18
  from deepdoc.parser.pdf_parser import PlainParser
19
- from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
20
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
21
  from rag.settings import cron_logger
22
 
@@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
112
  "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
113
  doc = {
114
  "docnm_kwd": filename,
115
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
116
  }
117
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
118
  res = []
119
  pdf_parser = None
120
  sections = []
 
16
  from timeit import default_timer as timer
17
  import re
18
  from deepdoc.parser.pdf_parser import PlainParser
19
+ from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
20
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
21
  from rag.settings import cron_logger
22
 
 
112
  "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
113
  doc = {
114
  "docnm_kwd": filename,
115
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
116
  }
117
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
118
  res = []
119
  pdf_parser = None
120
  sections = []
rag/app/one.py CHANGED
@@ -14,7 +14,7 @@ from tika import parser
14
  from io import BytesIO
15
  import re
16
  from rag.app import laws
17
- from rag.nlp import huqie, tokenize, find_codec
18
  from deepdoc.parser import PdfParser, ExcelParser, PlainParser
19
 
20
 
@@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
111
 
112
  doc = {
113
  "docnm_kwd": filename,
114
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
115
  }
116
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
117
  tokenize(doc, "\n".join(sections), eng)
118
  return [doc]
119
 
 
14
  from io import BytesIO
15
  import re
16
  from rag.app import laws
17
+ from rag.nlp import rag_tokenizer, tokenize, find_codec
18
  from deepdoc.parser import PdfParser, ExcelParser, PlainParser
19
 
20
 
 
111
 
112
  doc = {
113
  "docnm_kwd": filename,
114
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
115
  }
116
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
117
  tokenize(doc, "\n".join(sections), eng)
118
  return [doc]
119
 
rag/app/paper.py CHANGED
@@ -15,7 +15,7 @@ import re
15
  from collections import Counter
16
 
17
  from api.db import ParserType
18
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
19
  from deepdoc.parser import PdfParser, PlainParser
20
  import numpy as np
21
  from rag.utils import num_tokens_from_string
@@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
153
  else:
154
  raise NotImplementedError("file type not supported yet(pdf supported)")
155
 
156
- doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
157
- "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
158
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
159
- doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
160
  # is it English
161
  eng = lang.lower() == "english" # pdf_parser.is_english
162
  print("It's English.....", eng)
 
15
  from collections import Counter
16
 
17
  from api.db import ParserType
18
+ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
19
  from deepdoc.parser import PdfParser, PlainParser
20
  import numpy as np
21
  from rag.utils import num_tokens_from_string
 
153
  else:
154
  raise NotImplementedError("file type not supported yet(pdf supported)")
155
 
156
+ doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
157
+ "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
158
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
159
+ doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
160
  # is it English
161
  eng = lang.lower() == "english" # pdf_parser.is_english
162
  print("It's English.....", eng)
rag/app/presentation.py CHANGED
@@ -17,7 +17,7 @@ from io import BytesIO
17
  from PIL import Image
18
 
19
  from rag.nlp import tokenize, is_english
20
- from rag.nlp import huqie
21
  from deepdoc.parser import PdfParser, PptParser, PlainParser
22
  from PyPDF2 import PdfReader as pdf2_read
23
 
@@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
96
  eng = lang.lower() == "english"
97
  doc = {
98
  "docnm_kwd": filename,
99
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
100
  }
101
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
102
  res = []
103
  if re.search(r"\.pptx?$", filename, re.IGNORECASE):
104
  ppt_parser = Ppt()
 
17
  from PIL import Image
18
 
19
  from rag.nlp import tokenize, is_english
20
+ from rag.nlp import rag_tokenizer
21
  from deepdoc.parser import PdfParser, PptParser, PlainParser
22
  from PyPDF2 import PdfReader as pdf2_read
23
 
 
96
  eng = lang.lower() == "english"
97
  doc = {
98
  "docnm_kwd": filename,
99
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
100
  }
101
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
102
  res = []
103
  if re.search(r"\.pptx?$", filename, re.IGNORECASE):
104
  ppt_parser = Ppt()
rag/app/qa.py CHANGED
@@ -16,7 +16,7 @@ from io import BytesIO
16
  from nltk import word_tokenize
17
  from openpyxl import load_workbook
18
  from rag.nlp import is_english, random_choices, find_codec
19
- from rag.nlp import huqie
20
  from deepdoc.parser import ExcelParser
21
 
22
 
@@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
73
  aprefix = "Answer: " if eng else "回答:"
74
  d["content_with_weight"] = "\t".join(
75
  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
76
- d["content_ltks"] = huqie.qie(q)
77
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
78
  return d
79
 
80
 
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
94
  res = []
95
  doc = {
96
  "docnm_kwd": filename,
97
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
98
  }
99
  if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
100
  callback(0.1, "Start to parse.")
 
16
  from nltk import word_tokenize
17
  from openpyxl import load_workbook
18
  from rag.nlp import is_english, random_choices, find_codec
19
+ from rag.nlp import rag_tokenizer
20
  from deepdoc.parser import ExcelParser
21
 
22
 
 
73
  aprefix = "Answer: " if eng else "回答:"
74
  d["content_with_weight"] = "\t".join(
75
  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
76
+ d["content_ltks"] = rag_tokenizer.tokenize(q)
77
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
78
  return d
79
 
80
 
 
94
  res = []
95
  doc = {
96
  "docnm_kwd": filename,
97
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
98
  }
99
  if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
100
  callback(0.1, "Start to parse.")
rag/app/resume.py CHANGED
@@ -18,7 +18,7 @@ import re
18
  import pandas as pd
19
  import requests
20
  from api.db.services.knowledgebase_service import KnowledgebaseService
21
- from rag.nlp import huqie
22
  from deepdoc.parser.resume import refactor
23
  from deepdoc.parser.resume import step_one, step_two
24
  from rag.settings import cron_logger
@@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
131
  titles.append(str(v))
132
  doc = {
133
  "docnm_kwd": filename,
134
- "title_tks": huqie.qie("-".join(titles) + "-简历")
135
  }
136
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
137
  pairs = []
138
  for n, m in field_map.items():
139
  if not resume.get(n):
@@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):
147
 
148
  doc["content_with_weight"] = "\n".join(
149
  ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
150
- doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
151
- doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
152
  for n, _ in field_map.items():
153
  if n not in resume:
154
  continue
@@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
156
  len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
157
  resume[n] = resume[n][0]
158
  if n.find("_tks") > 0:
159
- resume[n] = huqie.qieqie(resume[n])
160
  doc[n] = resume[n]
161
 
162
  print(doc)
 
18
  import pandas as pd
19
  import requests
20
  from api.db.services.knowledgebase_service import KnowledgebaseService
21
+ from rag.nlp import rag_tokenizer
22
  from deepdoc.parser.resume import refactor
23
  from deepdoc.parser.resume import step_one, step_two
24
  from rag.settings import cron_logger
 
131
  titles.append(str(v))
132
  doc = {
133
  "docnm_kwd": filename,
134
+ "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
135
  }
136
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
137
  pairs = []
138
  for n, m in field_map.items():
139
  if not resume.get(n):
 
147
 
148
  doc["content_with_weight"] = "\n".join(
149
  ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
150
+ doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
151
+ doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
152
  for n, _ in field_map.items():
153
  if n not in resume:
154
  continue
 
156
  len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
157
  resume[n] = resume[n][0]
158
  if n.find("_tks") > 0:
159
+ resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
160
  doc[n] = resume[n]
161
 
162
  print(doc)
rag/app/table.py CHANGED
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
20
  from dateutil.parser import parse as datetime_parse
21
 
22
  from api.db.services.knowledgebase_service import KnowledgebaseService
23
- from rag.nlp import huqie, is_english, tokenize, find_codec
24
  from deepdoc.parser import ExcelParser
25
 
26
 
@@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
216
  for ii, row in df.iterrows():
217
  d = {
218
  "docnm_kwd": filename,
219
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
220
  }
221
  row_txt = []
222
  for j in range(len(clmns)):
@@ -227,7 +227,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
227
  if pd.isna(row[clmns[j]]):
228
  continue
229
  fld = clmns_map[j][0]
230
- d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
231
  row[clmns[j]])
232
  row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
233
  if not row_txt:
 
20
  from dateutil.parser import parse as datetime_parse
21
 
22
  from api.db.services.knowledgebase_service import KnowledgebaseService
23
+ from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
24
  from deepdoc.parser import ExcelParser
25
 
26
 
 
216
  for ii, row in df.iterrows():
217
  d = {
218
  "docnm_kwd": filename,
219
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
220
  }
221
  row_txt = []
222
  for j in range(len(clmns)):
 
227
  if pd.isna(row[clmns[j]]):
228
  continue
229
  fld = clmns_map[j][0]
230
+ d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
231
  row[clmns[j]])
232
  row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
233
  if not row_txt:
rag/nlp/__init__.py CHANGED
@@ -2,7 +2,7 @@ import random
2
  from collections import Counter
3
 
4
  from rag.utils import num_tokens_from_string
5
- from . import huqie
6
  import re
7
  import copy
8
 
@@ -109,8 +109,8 @@ def is_english(texts):
109
  def tokenize(d, t, eng):
110
  d["content_with_weight"] = t
111
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
112
- d["content_ltks"] = huqie.qie(t)
113
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
114
 
115
 
116
  def tokenize_chunks(chunks, doc, eng, pdf_parser):
 
2
  from collections import Counter
3
 
4
  from rag.utils import num_tokens_from_string
5
+ from . import rag_tokenizer
6
  import re
7
  import copy
8
 
 
109
  def tokenize(d, t, eng):
110
  d["content_with_weight"] = t
111
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
112
+ d["content_ltks"] = rag_tokenizer.tokenize(t)
113
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
114
 
115
 
116
  def tokenize_chunks(chunks, doc, eng, pdf_parser):
rag/nlp/query.py CHANGED
@@ -7,7 +7,7 @@ import logging
7
  import copy
8
  from elasticsearch_dsl import Q
9
 
10
- from rag.nlp import huqie, term_weight, synonym
11
 
12
 
13
  class EsQueryer:
@@ -47,13 +47,13 @@ class EsQueryer:
47
  txt = re.sub(
48
  r"[ \r\n\t,,。??/`!!&]+",
49
  " ",
50
- huqie.tradi2simp(
51
- huqie.strQ2B(
52
  txt.lower()))).strip()
53
  txt = EsQueryer.rmWWW(txt)
54
 
55
  if not self.isChinese(txt):
56
- tks = huqie.qie(txt).split(" ")
57
  q = copy.deepcopy(tks)
58
  for i in range(1, len(tks)):
59
  q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
@@ -65,7 +65,7 @@ class EsQueryer:
65
  boost=1)#, minimum_should_match=min_match)
66
  ), tks
67
 
68
- def needQieqie(tk):
69
  if len(tk) < 4:
70
  return False
71
  if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
@@ -81,7 +81,7 @@ class EsQueryer:
81
  logging.info(json.dumps(twts, ensure_ascii=False))
82
  tms = []
83
  for tk, w in sorted(twts, key=lambda x: x[1] * -1):
84
- sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
85
  sm = [
86
  re.sub(
87
  r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
@@ -110,10 +110,10 @@ class EsQueryer:
110
  if len(twts) > 1:
111
  tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
112
  if re.match(r"[0-9a-z ]+$", tt):
113
- tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
114
 
115
  syns = " OR ".join(
116
- ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
117
  if syns:
118
  tms = f"({tms})^5 OR ({syns})^0.7"
119
 
 
7
  import copy
8
  from elasticsearch_dsl import Q
9
 
10
+ from rag.nlp import rag_tokenizer, term_weight, synonym
11
 
12
 
13
  class EsQueryer:
 
47
  txt = re.sub(
48
  r"[ \r\n\t,,。??/`!!&]+",
49
  " ",
50
+ rag_tokenizer.tradi2simp(
51
+ rag_tokenizer.strQ2B(
52
  txt.lower()))).strip()
53
  txt = EsQueryer.rmWWW(txt)
54
 
55
  if not self.isChinese(txt):
56
+ tks = rag_tokenizer.tokenize(txt).split(" ")
57
  q = copy.deepcopy(tks)
58
  for i in range(1, len(tks)):
59
  q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
 
65
  boost=1)#, minimum_should_match=min_match)
66
  ), tks
67
 
68
+ def need_fine_grained_tokenize(tk):
69
  if len(tk) < 4:
70
  return False
71
  if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
 
81
  logging.info(json.dumps(twts, ensure_ascii=False))
82
  tms = []
83
  for tk, w in sorted(twts, key=lambda x: x[1] * -1):
84
+ sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
85
  sm = [
86
  re.sub(
87
  r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
 
110
  if len(twts) > 1:
111
  tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
112
  if re.match(r"[0-9a-z ]+$", tt):
113
+ tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)
114
 
115
  syns = " OR ".join(
116
+ ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
117
  if syns:
118
  tms = f"({tms})^5 OR ({syns})^0.7"
119
 
rag/nlp/rag_tokenizer.py ADDED
@@ -0,0 +1,423 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import copy
4
+ import datrie
5
+ import math
6
+ import os
7
+ import re
8
+ import string
9
+ import sys
10
+ from hanziconv import HanziConv
11
+ from huggingface_hub import snapshot_download
12
+ from nltk import word_tokenize
13
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
14
+ from api.utils.file_utils import get_project_base_directory
15
+
16
+
17
+ class RagTokenizer:
18
+ def key_(self, line):
19
+ return str(line.lower().encode("utf-8"))[2:-1]
20
+
21
+ def rkey_(self, line):
22
+ return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
23
+
24
+ def loadDict_(self, fnm):
25
+ print("[HUQIE]:Build trie", fnm, file=sys.stderr)
26
+ try:
27
+ of = open(fnm, "r")
28
+ while True:
29
+ line = of.readline()
30
+ if not line:
31
+ break
32
+ line = re.sub(r"[\r\n]+", "", line)
33
+ line = re.split(r"[ \t]", line)
34
+ k = self.key_(line[0])
35
+ F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
36
+ if k not in self.trie_ or self.trie_[k][0] < F:
37
+ self.trie_[self.key_(line[0])] = (F, line[2])
38
+ self.trie_[self.rkey_(line[0])] = 1
39
+ self.trie_.save(fnm + ".trie")
40
+ of.close()
41
+ except Exception as e:
42
+ print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
43
+
44
+ def __init__(self, debug=False):
45
+ self.DEBUG = debug
46
+ self.DENOMINATOR = 1000000
47
+ self.trie_ = datrie.Trie(string.printable)
48
+ self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
49
+
50
+ self.stemmer = PorterStemmer()
51
+ self.lemmatizer = WordNetLemmatizer()
52
+
53
+ self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
54
+ try:
55
+ self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
56
+ return
57
+ except Exception as e:
58
+ print("[HUQIE]:Build default trie", file=sys.stderr)
59
+ self.trie_ = datrie.Trie(string.printable)
60
+
61
+ self.loadDict_(self.DIR_ + ".txt")
62
+
63
+ def loadUserDict(self, fnm):
64
+ try:
65
+ self.trie_ = datrie.Trie.load(fnm + ".trie")
66
+ return
67
+ except Exception as e:
68
+ self.trie_ = datrie.Trie(string.printable)
69
+ self.loadDict_(fnm)
70
+
71
+ def addUserDict(self, fnm):
72
+ self.loadDict_(fnm)
73
+
74
+ def _strQ2B(self, ustring):
75
+ """把字符串全角转半角"""
76
+ rstring = ""
77
+ for uchar in ustring:
78
+ inside_code = ord(uchar)
79
+ if inside_code == 0x3000:
80
+ inside_code = 0x0020
81
+ else:
82
+ inside_code -= 0xfee0
83
+ if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
84
+ rstring += uchar
85
+ else:
86
+ rstring += chr(inside_code)
87
+ return rstring
88
+
89
+ def _tradi2simp(self, line):
90
+ return HanziConv.toSimplified(line)
91
+
92
+ def dfs_(self, chars, s, preTks, tkslist):
93
+ MAX_L = 10
94
+ res = s
95
+ # if s > MAX_L or s>= len(chars):
96
+ if s >= len(chars):
97
+ tkslist.append(preTks)
98
+ return res
99
+
100
+ # pruning
101
+ S = s + 1
102
+ if s + 2 <= len(chars):
103
+ t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
104
+ if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
105
+ self.key_(t2)):
106
+ S = s + 2
107
+ if len(preTks) > 2 and len(
108
+ preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
109
+ t1 = preTks[-1][0] + "".join(chars[s:s + 1])
110
+ if self.trie_.has_keys_with_prefix(self.key_(t1)):
111
+ S = s + 2
112
+
113
+ ################
114
+ for e in range(S, len(chars) + 1):
115
+ t = "".join(chars[s:e])
116
+ k = self.key_(t)
117
+
118
+ if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
119
+ break
120
+
121
+ if k in self.trie_:
122
+ pretks = copy.deepcopy(preTks)
123
+ if k in self.trie_:
124
+ pretks.append((t, self.trie_[k]))
125
+ else:
126
+ pretks.append((t, (-12, '')))
127
+ res = max(res, self.dfs_(chars, e, pretks, tkslist))
128
+
129
+ if res > s:
130
+ return res
131
+
132
+ t = "".join(chars[s:s + 1])
133
+ k = self.key_(t)
134
+ if k in self.trie_:
135
+ preTks.append((t, self.trie_[k]))
136
+ else:
137
+ preTks.append((t, (-12, '')))
138
+
139
+ return self.dfs_(chars, s + 1, preTks, tkslist)
140
+
141
+ def freq(self, tk):
142
+ k = self.key_(tk)
143
+ if k not in self.trie_:
144
+ return 0
145
+ return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)
146
+
147
+ def tag(self, tk):
148
+ k = self.key_(tk)
149
+ if k not in self.trie_:
150
+ return ""
151
+ return self.trie_[k][1]
152
+
153
+ def score_(self, tfts):
154
+ B = 30
155
+ F, L, tks = 0, 0, []
156
+ for tk, (freq, tag) in tfts:
157
+ F += freq
158
+ L += 0 if len(tk) < 2 else 1
159
+ tks.append(tk)
160
+ F /= len(tks)
161
+ L /= len(tks)
162
+ if self.DEBUG:
163
+ print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
164
+ return tks, B / len(tks) + L + F
165
+
166
+ def sortTks_(self, tkslist):
167
+ res = []
168
+ for tfts in tkslist:
169
+ tks, s = self.score_(tfts)
170
+ res.append((tks, s))
171
+ return sorted(res, key=lambda x: x[1], reverse=True)
172
+
173
+ def merge_(self, tks):
174
+ patts = [
175
+ (r"[ ]+", " "),
176
+ (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
177
+ ]
178
+ # for p,s in patts: tks = re.sub(p, s, tks)
179
+
180
+ # if split chars is part of token
181
+ res = []
182
+ tks = re.sub(r"[ ]+", " ", tks).split(" ")
183
+ s = 0
184
+ while True:
185
+ if s >= len(tks):
186
+ break
187
+ E = s + 1
188
+ for e in range(s + 2, min(len(tks) + 2, s + 6)):
189
+ tk = "".join(tks[s:e])
190
+ if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
191
+ E = e
192
+ res.append("".join(tks[s:E]))
193
+ s = E
194
+
195
+ return " ".join(res)
196
+
197
+ def maxForward_(self, line):
198
+ res = []
199
+ s = 0
200
+ while s < len(line):
201
+ e = s + 1
202
+ t = line[s:e]
203
+ while e < len(line) and self.trie_.has_keys_with_prefix(
204
+ self.key_(t)):
205
+ e += 1
206
+ t = line[s:e]
207
+
208
+ while e - 1 > s and self.key_(t) not in self.trie_:
209
+ e -= 1
210
+ t = line[s:e]
211
+
212
+ if self.key_(t) in self.trie_:
213
+ res.append((t, self.trie_[self.key_(t)]))
214
+ else:
215
+ res.append((t, (0, '')))
216
+
217
+ s = e
218
+
219
+ return self.score_(res)
220
+
221
+ def maxBackward_(self, line):
222
+ res = []
223
+ s = len(line) - 1
224
+ while s >= 0:
225
+ e = s + 1
226
+ t = line[s:e]
227
+ while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
228
+ s -= 1
229
+ t = line[s:e]
230
+
231
+ while s + 1 < e and self.key_(t) not in self.trie_:
232
+ s += 1
233
+ t = line[s:e]
234
+
235
+ if self.key_(t) in self.trie_:
236
+ res.append((t, self.trie_[self.key_(t)]))
237
+ else:
238
+ res.append((t, (0, '')))
239
+
240
+ s -= 1
241
+
242
+ return self.score_(res[::-1])
243
+
244
+ def tokenize(self, line):
245
+ line = self._strQ2B(line).lower()
246
+ line = self._tradi2simp(line)
247
+ zh_num = len([1 for c in line if is_chinese(c)])
248
+ if zh_num < len(line) * 0.2:
249
+ return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])
250
+
251
+ arr = re.split(self.SPLIT_CHAR, line)
252
+ res = []
253
+ for L in arr:
254
+ if len(L) < 2 or re.match(
255
+ r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
256
+ res.append(L)
257
+ continue
258
+ # print(L)
259
+
260
+ # use maxforward for the first time
261
+ tks, s = self.maxForward_(L)
262
+ tks1, s1 = self.maxBackward_(L)
263
+ if self.DEBUG:
264
+ print("[FW]", tks, s)
265
+ print("[BW]", tks1, s1)
266
+
267
+ diff = [0 for _ in range(max(len(tks1), len(tks)))]
268
+ for i in range(min(len(tks1), len(tks))):
269
+ if tks[i] != tks1[i]:
270
+ diff[i] = 1
271
+
272
+ if s1 > s:
273
+ tks = tks1
274
+
275
+ i = 0
276
+ while i < len(tks):
277
+ s = i
278
+ while s < len(tks) and diff[s] == 0:
279
+ s += 1
280
+ if s == len(tks):
281
+ res.append(" ".join(tks[i:]))
282
+ break
283
+ if s > i:
284
+ res.append(" ".join(tks[i:s]))
285
+
286
+ e = s
287
+ while e < len(tks) and e - s < 5 and diff[e] == 1:
288
+ e += 1
289
+
290
+ tkslist = []
291
+ self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
292
+ res.append(" ".join(self.sortTks_(tkslist)[0][0]))
293
+
294
+ i = e + 1
295
+
296
+ res = " ".join(res)
297
+ if self.DEBUG:
298
+ print("[TKS]", self.merge_(res))
299
+ return self.merge_(res)
300
+
301
+ def fine_grained_tokenize(self, tks):
302
+ tks = tks.split(" ")
303
+ zh_num = len([1 for c in tks if c and is_chinese(c[0])])
304
+ if zh_num < len(tks) * 0.2:
305
+ res = []
306
+ for tk in tks:
307
+ res.extend(tk.split("/"))
308
+ return " ".join(res)
309
+
310
+ res = []
311
+ for tk in tks:
312
+ if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
313
+ res.append(tk)
314
+ continue
315
+ tkslist = []
316
+ if len(tk) > 10:
317
+ tkslist.append(tk)
318
+ else:
319
+ self.dfs_(tk, 0, [], tkslist)
320
+ if len(tkslist) < 2:
321
+ res.append(tk)
322
+ continue
323
+ stk = self.sortTks_(tkslist)[1][0]
324
+ if len(stk) == len(tk):
325
+ stk = tk
326
+ else:
327
+ if re.match(r"[a-z\.-]+$", tk):
328
+ for t in stk:
329
+ if len(t) < 3:
330
+ stk = tk
331
+ break
332
+ else:
333
+ stk = " ".join(stk)
334
+ else:
335
+ stk = " ".join(stk)
336
+
337
+ res.append(stk)
338
+
339
+ return " ".join(res)
340
+
341
+
342
+ def is_chinese(s):
343
+ if s >= u'\u4e00' and s <= u'\u9fa5':
344
+ return True
345
+ else:
346
+ return False
347
+
348
+
349
+ def is_number(s):
350
+ if s >= u'\u0030' and s <= u'\u0039':
351
+ return True
352
+ else:
353
+ return False
354
+
355
+
356
+ def is_alphabet(s):
357
+ if (s >= u'\u0041' and s <= u'\u005a') or (
358
+ s >= u'\u0061' and s <= u'\u007a'):
359
+ return True
360
+ else:
361
+ return False
362
+
363
+
364
+ def naiveQie(txt):
365
+ tks = []
366
+ for t in txt.split(" "):
367
+ if tks and re.match(r".*[a-zA-Z]$", tks[-1]
368
+ ) and re.match(r".*[a-zA-Z]$", t):
369
+ tks.append(" ")
370
+ tks.append(t)
371
+ return tks
372
+
373
+
374
+ tokenizer = RagTokenizer()
375
+ tokenize = tokenizer.tokenize
376
+ fine_grained_tokenize = tokenizer.fine_grained_tokenize
377
+ tag = tokenizer.tag
378
+ freq = tokenizer.freq
379
+ loadUserDict = tokenizer.loadUserDict
380
+ addUserDict = tokenizer.addUserDict
381
+ tradi2simp = tokenizer._tradi2simp
382
+ strQ2B = tokenizer._strQ2B
383
+
384
+ if __name__ == '__main__':
385
+ tknzr = RagTokenizer(debug=True)
386
+ # huqie.addUserDict("/tmp/tmp.new.tks.dict")
387
+ tks = tknzr.tokenize(
388
+ "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
389
+ print(tknzr.fine_grained_tokenize(tks))
390
+ tks = tknzr.tokenize(
391
+ "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
392
+ print(tknzr.fine_grained_tokenize(tks))
393
+ tks = tknzr.tokenize(
394
+ "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
395
+ print(tknzr.fine_grained_tokenize(tks))
396
+ tks = tknzr.tokenize(
397
+ "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
398
+ print(tknzr.fine_grained_tokenize(tks))
399
+ tks = tknzr.tokenize("虽然我不怎么玩")
400
+ print(tknzr.fine_grained_tokenize(tks))
401
+ tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
402
+ print(tknzr.fine_grained_tokenize(tks))
403
+ tks = tknzr.tokenize(
404
+ "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
405
+ print(tknzr.fine_grained_tokenize(tks))
406
+ tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
407
+ print(tknzr.fine_grained_tokenize(tks))
408
+ tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
409
+ print(tknzr.fine_grained_tokenize(tks))
410
+ tks = tknzr.tokenize(
411
+ "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
412
+ print(tknzr.fine_grained_tokenize(tks))
413
+ if len(sys.argv) < 2:
414
+ sys.exit()
415
+ tknzr.DEBUG = False
416
+ tknzr.loadUserDict(sys.argv[1])
417
+ of = open(sys.argv[2], "r")
418
+ while True:
419
+ line = of.readline()
420
+ if not line:
421
+ break
422
+ print(tknzr.tokenize(line))
423
+ of.close()
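
A note on the user-dictionary hooks kept by the new module (`loadUserDict` / `addUserDict`): the expected line format, inferred from `loadDict_` above, is term, frequency and POS tag separated by spaces or tabs. A minimal sketch with a hypothetical dictionary path and entries:

```python
from rag.nlp import rag_tokenizer

# Hypothetical user dictionary: one "term frequency tag" entry per line.
# loadDict_ log-scales the frequency against DENOMINATOR and keeps the tag.
with open("/tmp/user.dict.txt", "w", encoding="utf-8") as f:
    f.write("检索增强生成 1000000 n\n")
    f.write("向量召回 500000 n\n")

rag_tokenizer.addUserDict("/tmp/user.dict.txt")  # merged into the in-memory trie
print(rag_tokenizer.tokenize("检索增强生成和向量召回"))
```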
rag/nlp/search.py CHANGED
@@ -9,7 +9,7 @@ from dataclasses import dataclass
9
 
10
  from rag.settings import es_logger
11
  from rag.utils import rmSpace
12
- from rag.nlp import huqie, query
13
  import numpy as np
14
 
15
 
@@ -128,7 +128,7 @@ class Dealer:
128
  kwds = set([])
129
  for k in keywords:
130
  kwds.add(k)
131
- for kk in huqie.qieqie(k).split(" "):
132
  if len(kk) < 2:
133
  continue
134
  if kk in kwds:
@@ -243,7 +243,7 @@ class Dealer:
243
  assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
244
  len(ans_v[0]), len(chunk_v[0]))
245
 
246
- chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
247
  for ck in chunks]
248
  cites = {}
249
  thr = 0.63
@@ -251,7 +251,7 @@ class Dealer:
251
  for i, a in enumerate(pieces_):
252
  sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
253
  chunk_v,
254
- huqie.qie(
255
  self.qryr.rmWWW(pieces_[i])).split(" "),
256
  chunks_tks,
257
  tkweight, vtweight)
@@ -310,8 +310,8 @@ class Dealer:
310
  def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
311
  return self.qryr.hybrid_similarity(ans_embd,
312
  ins_embd,
313
- huqie.qie(ans).split(" "),
314
- huqie.qie(inst).split(" "))
315
 
316
  def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
317
  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
@@ -385,7 +385,7 @@ class Dealer:
385
  for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
386
  fld, v = r.group(1), r.group(3)
387
  match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
388
- fld, huqie.qieqie(huqie.qie(v)))
389
  replaces.append(
390
  ("{}{}'{}'".format(
391
  r.group(1),
 
9
 
10
  from rag.settings import es_logger
11
  from rag.utils import rmSpace
12
+ from rag.nlp import rag_tokenizer, query
13
  import numpy as np
14
 
15
 
 
128
  kwds = set([])
129
  for k in keywords:
130
  kwds.add(k)
131
+ for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
132
  if len(kk) < 2:
133
  continue
134
  if kk in kwds:
 
243
  assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
244
  len(ans_v[0]), len(chunk_v[0]))
245
 
246
+ chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
247
  for ck in chunks]
248
  cites = {}
249
  thr = 0.63
 
251
  for i, a in enumerate(pieces_):
252
  sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
253
  chunk_v,
254
+ rag_tokenizer.tokenize(
255
  self.qryr.rmWWW(pieces_[i])).split(" "),
256
  chunks_tks,
257
  tkweight, vtweight)
 
310
  def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
311
  return self.qryr.hybrid_similarity(ans_embd,
312
  ins_embd,
313
+ rag_tokenizer.tokenize(ans).split(" "),
314
+ rag_tokenizer.tokenize(inst).split(" "))
315
 
316
  def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
317
  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
 
385
  for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
386
  fld, v = r.group(1), r.group(3)
387
  match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
388
+ fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
389
  replaces.append(
390
  ("{}{}'{}'".format(
391
  r.group(1),
rag/nlp/term_weight.py CHANGED
@@ -4,7 +4,7 @@ import json
4
  import re
5
  import os
6
  import numpy as np
7
- from rag.nlp import huqie
8
  from api.utils.file_utils import get_project_base_directory
9
 
10
 
@@ -83,7 +83,7 @@ class Dealer:
83
  txt = re.sub(p, r, txt)
84
 
85
  res = []
86
- for t in huqie.qie(txt).split(" "):
87
  tk = t
88
  if (stpwd and tk in self.stop_words) or (
89
  re.match(r"[0-9]$", tk) and not num):
@@ -161,7 +161,7 @@ class Dealer:
161
  return m[self.ne[t]]
162
 
163
  def postag(t):
164
- t = huqie.tag(t)
165
  if t in set(["r", "c", "d"]):
166
  return 0.3
167
  if t in set(["ns", "nt"]):
@@ -175,14 +175,14 @@ class Dealer:
175
  def freq(t):
176
  if re.match(r"[0-9. -]{2,}$", t):
177
  return 3
178
- s = huqie.freq(t)
179
  if not s and re.match(r"[a-z. -]+$", t):
180
  return 300
181
  if not s:
182
  s = 0
183
 
184
  if not s and len(t) >= 4:
185
- s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
186
  if len(s) > 1:
187
  s = np.min([freq(tt) for tt in s]) / 6.
188
  else:
@@ -198,7 +198,7 @@ class Dealer:
198
  elif re.match(r"[a-z. -]+$", t):
199
  return 300
200
  elif len(t) >= 4:
201
- s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
202
  if len(s) > 1:
203
  return max(3, np.min([df(tt) for tt in s]) / 6.)
204
 
 
4
  import re
5
  import os
6
  import numpy as np
7
+ from rag.nlp import rag_tokenizer
8
  from api.utils.file_utils import get_project_base_directory
9
 
10
 
 
83
  txt = re.sub(p, r, txt)
84
 
85
  res = []
86
+ for t in rag_tokenizer.tokenize(txt).split(" "):
87
  tk = t
88
  if (stpwd and tk in self.stop_words) or (
89
  re.match(r"[0-9]$", tk) and not num):
 
161
  return m[self.ne[t]]
162
 
163
  def postag(t):
164
+ t = rag_tokenizer.tag(t)
165
  if t in set(["r", "c", "d"]):
166
  return 0.3
167
  if t in set(["ns", "nt"]):
 
175
  def freq(t):
176
  if re.match(r"[0-9. -]{2,}$", t):
177
  return 3
178
+ s = rag_tokenizer.freq(t)
179
  if not s and re.match(r"[a-z. -]+$", t):
180
  return 300
181
  if not s:
182
  s = 0
183
 
184
  if not s and len(t) >= 4:
185
+ s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
186
  if len(s) > 1:
187
  s = np.min([freq(tt) for tt in s]) / 6.
188
  else:
 
198
  elif re.match(r"[a-z. -]+$", t):
199
  return 300
200
  elif len(t) >= 4:
201
+ s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
202
  if len(s) > 1:
203
  return max(3, np.min([df(tt) for tt in s]) / 6.)
204
 
rag/svr/cache_file_svr.py CHANGED
@@ -4,13 +4,14 @@ import traceback
4
 
5
  from api.db.db_models import close_connection
6
  from api.db.services.task_service import TaskService
 
7
  from rag.utils.minio_conn import MINIO
8
  from rag.utils.redis_conn import REDIS_CONN
9
 
10
 
11
  def collect():
12
  doc_locations = TaskService.get_ongoing_doc_name()
13
- #print(tasks)
14
  if len(doc_locations) == 0:
15
  time.sleep(1)
16
  return
@@ -28,7 +29,7 @@ def main():
28
  if REDIS_CONN.exist(key):continue
29
  file_bin = MINIO.get(kb_id, loc)
30
  REDIS_CONN.transaction(key, file_bin, 12 * 60)
31
- print("CACHE:", loc)
32
  except Exception as e:
33
  traceback.print_stack(e)
34
  except Exception as e:
 
4
 
5
  from api.db.db_models import close_connection
6
  from api.db.services.task_service import TaskService
7
+ from rag.settings import cron_logger
8
  from rag.utils.minio_conn import MINIO
9
  from rag.utils.redis_conn import REDIS_CONN
10
 
11
 
12
  def collect():
13
  doc_locations = TaskService.get_ongoing_doc_name()
14
+ print(doc_locations)
15
  if len(doc_locations) == 0:
16
  time.sleep(1)
17
  return
 
29
  if REDIS_CONN.exist(key):continue
30
  file_bin = MINIO.get(kb_id, loc)
31
  REDIS_CONN.transaction(key, file_bin, 12 * 60)
32
+ cron_logger.info("CACHE: {}".format(loc))
33
  except Exception as e:
34
  traceback.print_stack(e)
35
  except Exception as e:
rag/svr/task_broker.py CHANGED
@@ -21,7 +21,6 @@ from datetime import datetime
21
  from api.db.db_models import Task
22
  from api.db.db_utils import bulk_insert_into_db
23
  from api.db.services.file2document_service import File2DocumentService
24
- from api.db.services.file_service import FileService
25
  from api.db.services.task_service import TaskService
26
  from deepdoc.parser import PdfParser
27
  from deepdoc.parser.excel_parser import RAGFlowExcelParser
 
21
  from api.db.db_models import Task
22
  from api.db.db_utils import bulk_insert_into_db
23
  from api.db.services.file2document_service import File2DocumentService
 
24
  from api.db.services.task_service import TaskService
25
  from deepdoc.parser import PdfParser
26
  from deepdoc.parser.excel_parser import RAGFlowExcelParser