KevinHuSh committed
Commit cfd6ece · 1 parent: dcce454

refine code (#595)

### What problem does this PR solve?
### Type of change
- [x] Refactoring
- api/apps/chunk_app.py +8 -8
- api/db/services/task_service.py +2 -3
- deepdoc/parser/docx_parser.py +3 -3
- deepdoc/parser/pdf_parser.py +7 -7
- deepdoc/parser/resume/entities/corporations.py +3 -3
- deepdoc/parser/resume/step_two.py +26 -26
- deepdoc/vision/table_structure_recognizer.py +3 -3
- rag/app/book.py +3 -3
- rag/app/laws.py +3 -3
- rag/app/manual.py +3 -3
- rag/app/naive.py +3 -3
- rag/app/one.py +3 -3
- rag/app/paper.py +5 -5
- rag/app/presentation.py +3 -3
- rag/app/qa.py +4 -4
- rag/app/resume.py +6 -6
- rag/app/table.py +3 -3
- rag/nlp/__init__.py +3 -3
- rag/nlp/query.py +8 -8
- rag/nlp/rag_tokenizer.py +423 -0
- rag/nlp/search.py +7 -7
- rag/nlp/term_weight.py +6 -6
- rag/svr/cache_file_svr.py +3 -2
- rag/svr/task_broker.py +0 -1
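Every changed file below follows the same pattern: call sites that used the old tokenizer module now import `rag.nlp.rag_tokenizer` and pair `tokenize` with `fine_grained_tokenize`. A minimal sketch of that call pattern, assuming the repository (with its `rag/res` dictionary files) is importable; the filename is made up for illustration:

```python
import re

from rag.nlp import rag_tokenizer  # loads/builds the trie-backed dictionary on import

filename = "annual_report_2023.pdf"  # hypothetical input

# Coarse tokens of the title with the extension stripped -- the recipe the
# chunkers in rag/app/*.py use for the "title_tks" field.
title_tks = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))

# A finer-grained pass over those tokens, stored as "title_sm_tks".
title_sm_tks = rag_tokenizer.fine_grained_tokenize(title_tks)

print(title_tks)
print(title_sm_tks)
```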
api/apps/chunk_app.py
CHANGED
@@ -20,7 +20,7 @@ from flask_login import login_required, current_user
|
|
20 |
from elasticsearch_dsl import Q
|
21 |
|
22 |
from rag.app.qa import rmPrefix, beAdoc
|
23 |
-
from rag.nlp import search,
|
24 |
from rag.utils.es_conn import ELASTICSEARCH
|
25 |
from rag.utils import rmSpace
|
26 |
from api.db import LLMType, ParserType
|
@@ -125,10 +125,10 @@ def set():
|
|
125 |
d = {
|
126 |
"id": req["chunk_id"],
|
127 |
"content_with_weight": req["content_with_weight"]}
|
128 |
-
d["content_ltks"] =
|
129 |
-
d["content_sm_ltks"] =
|
130 |
d["important_kwd"] = req["important_kwd"]
|
131 |
-
d["important_tks"] =
|
132 |
if "available_int" in req:
|
133 |
d["available_int"] = req["available_int"]
|
134 |
|
@@ -152,7 +152,7 @@ def set():
|
|
152 |
retmsg="Q&A must be separated by TAB/ENTER key.")
|
153 |
q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
|
154 |
d = beAdoc(d, arr[0], arr[1], not any(
|
155 |
-
[
|
156 |
|
157 |
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
158 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
@@ -202,11 +202,11 @@ def create():
|
|
202 |
md5 = hashlib.md5()
|
203 |
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
|
204 |
chunck_id = md5.hexdigest()
|
205 |
-
d = {"id": chunck_id, "content_ltks":
|
206 |
"content_with_weight": req["content_with_weight"]}
|
207 |
-
d["content_sm_ltks"] =
|
208 |
d["important_kwd"] = req.get("important_kwd", [])
|
209 |
-
d["important_tks"] =
|
210 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
211 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
212 |
|
|
|
20 |
from elasticsearch_dsl import Q
|
21 |
|
22 |
from rag.app.qa import rmPrefix, beAdoc
|
23 |
+
from rag.nlp import search, rag_tokenizer
|
24 |
from rag.utils.es_conn import ELASTICSEARCH
|
25 |
from rag.utils import rmSpace
|
26 |
from api.db import LLMType, ParserType
|
|
|
125 |
d = {
|
126 |
"id": req["chunk_id"],
|
127 |
"content_with_weight": req["content_with_weight"]}
|
128 |
+
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
|
129 |
+
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
130 |
d["important_kwd"] = req["important_kwd"]
|
131 |
+
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
|
132 |
if "available_int" in req:
|
133 |
d["available_int"] = req["available_int"]
|
134 |
|
|
|
152 |
retmsg="Q&A must be separated by TAB/ENTER key.")
|
153 |
q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
|
154 |
d = beAdoc(d, arr[0], arr[1], not any(
|
155 |
+
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
156 |
|
157 |
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
158 |
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
|
|
202 |
md5 = hashlib.md5()
|
203 |
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
|
204 |
chunck_id = md5.hexdigest()
|
205 |
+
d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
|
206 |
"content_with_weight": req["content_with_weight"]}
|
207 |
+
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
208 |
d["important_kwd"] = req.get("important_kwd", [])
|
209 |
+
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
|
210 |
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
211 |
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
212 |
|
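The chunk_app.py handlers above now build their token fields straight from the request payload: coarse tokens of the chunk text, a fine-grained pass over them, and keyword tokens from the joined `important_kwd` list. A sketch of that step, with a hypothetical request body:

```python
from rag.nlp import rag_tokenizer

# Hypothetical payload shaped like what the set()/create() endpoints receive.
req = {
    "chunk_id": "abc123",
    "content_with_weight": "RAGFlow replaces its tokenizer module in this refactor.",
    "important_kwd": ["RAGFlow", "tokenizer"],
}

d = {"id": req["chunk_id"], "content_with_weight": req["content_with_weight"]}
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req["important_kwd"]
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
print(d)
```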
api/db/services/task_service.py
CHANGED
@@ -78,14 +78,13 @@ class TaskService(CommonService):
         docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
             .join(Document, on=(cls.model.doc_id == Document.id)) \
             .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
-            .join(File, on=(File2Document.file_id == File.id)) \
+            .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
             .where(
                 Document.status == StatusEnum.VALID.value,
                 Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
-                cls.model.progress >= 0,
                 cls.model.progress < 1,
-                cls.model.create_time >= current_timestamp() -
+                cls.model.create_time >= current_timestamp() - 1000 * 600
             )
         docs = list(docs.dicts())
         if not docs: return []
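The restored filter keeps only recently created tasks. Assuming `current_timestamp()` returns epoch milliseconds (which the `1000 * 600` constant suggests), the window works out to ten minutes; the same arithmetic as a standalone sketch:

```python
import time

def current_timestamp_ms() -> int:
    # Stand-in for the project's current_timestamp(); assumed to be epoch milliseconds.
    return int(time.time() * 1000)

WINDOW_MS = 1000 * 600  # 600,000 ms = 600 s = 10 minutes

cutoff = current_timestamp_ms() - WINDOW_MS
print("tasks pass the filter only if create_time >=", cutoff)
```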
deepdoc/parser/docx_parser.py
CHANGED
@@ -3,7 +3,7 @@ from docx import Document
|
|
3 |
import re
|
4 |
import pandas as pd
|
5 |
from collections import Counter
|
6 |
-
from rag.nlp import
|
7 |
from io import BytesIO
|
8 |
|
9 |
|
@@ -35,14 +35,14 @@ class RAGFlowDocxParser:
|
|
35 |
for p, n in patt:
|
36 |
if re.search(p, b):
|
37 |
return n
|
38 |
-
tks = [t for t in
|
39 |
if len(tks) > 3:
|
40 |
if len(tks) < 12:
|
41 |
return "Tx"
|
42 |
else:
|
43 |
return "Lx"
|
44 |
|
45 |
-
if len(tks) == 1 and
|
46 |
return "Nr"
|
47 |
|
48 |
return "Ot"
|
|
|
3 |
import re
|
4 |
import pandas as pd
|
5 |
from collections import Counter
|
6 |
+
from rag.nlp import rag_tokenizer
|
7 |
from io import BytesIO
|
8 |
|
9 |
|
|
|
35 |
for p, n in patt:
|
36 |
if re.search(p, b):
|
37 |
return n
|
38 |
+
tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
|
39 |
if len(tks) > 3:
|
40 |
if len(tks) < 12:
|
41 |
return "Tx"
|
42 |
else:
|
43 |
return "Lx"
|
44 |
|
45 |
+
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
46 |
return "Nr"
|
47 |
|
48 |
return "Ot"
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -16,7 +16,7 @@ from PyPDF2 import PdfReader as pdf2_read
|
|
16 |
|
17 |
from api.utils.file_utils import get_project_base_directory
|
18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
19 |
-
from rag.nlp import
|
20 |
from copy import deepcopy
|
21 |
from huggingface_hub import snapshot_download
|
22 |
|
@@ -95,13 +95,13 @@ class RAGFlowPdfParser:
|
|
95 |
h = max(self.__height(up), self.__height(down))
|
96 |
y_dis = self._y_dis(up, down)
|
97 |
LEN = 6
|
98 |
-
tks_down =
|
99 |
-
tks_up =
|
100 |
tks_all = up["text"][-LEN:].strip() \
|
101 |
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
102 |
up["text"][-1] + down["text"][0]) else "") \
|
103 |
+ down["text"][:LEN].strip()
|
104 |
-
tks_all =
|
105 |
fea = [
|
106 |
up.get("R", -1) == down.get("R", -1),
|
107 |
y_dis / h,
|
@@ -142,8 +142,8 @@ class RAGFlowPdfParser:
|
|
142 |
tks_down[-1] == tks_up[-1],
|
143 |
max(down["in_row"], up["in_row"]),
|
144 |
abs(down["in_row"] - up["in_row"]),
|
145 |
-
len(tks_down) == 1 and
|
146 |
-
len(tks_up) == 1 and
|
147 |
]
|
148 |
return fea
|
149 |
|
@@ -599,7 +599,7 @@ class RAGFlowPdfParser:
|
|
599 |
|
600 |
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
601 |
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
602 |
-
or
|
603 |
or b["top"] > b_["bottom"]:
|
604 |
i += 1
|
605 |
continue
|
|
|
16 |
|
17 |
from api.utils.file_utils import get_project_base_directory
|
18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
19 |
+
from rag.nlp import rag_tokenizer
|
20 |
from copy import deepcopy
|
21 |
from huggingface_hub import snapshot_download
|
22 |
|
|
|
95 |
h = max(self.__height(up), self.__height(down))
|
96 |
y_dis = self._y_dis(up, down)
|
97 |
LEN = 6
|
98 |
+
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
|
99 |
+
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
|
100 |
tks_all = up["text"][-LEN:].strip() \
|
101 |
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
102 |
up["text"][-1] + down["text"][0]) else "") \
|
103 |
+ down["text"][:LEN].strip()
|
104 |
+
tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
|
105 |
fea = [
|
106 |
up.get("R", -1) == down.get("R", -1),
|
107 |
y_dis / h,
|
|
|
142 |
tks_down[-1] == tks_up[-1],
|
143 |
max(down["in_row"], up["in_row"]),
|
144 |
abs(down["in_row"] - up["in_row"]),
|
145 |
+
len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
|
146 |
+
len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
|
147 |
]
|
148 |
return fea
|
149 |
|
|
|
599 |
|
600 |
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
601 |
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
602 |
+
or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
|
603 |
or b["top"] > b_["bottom"]:
|
604 |
i += 1
|
605 |
continue
|
deepdoc/parser/resume/entities/corporations.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import re,json,os
|
2 |
import pandas as pd
|
3 |
-
from rag.nlp import
|
4 |
from . import regions
|
5 |
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
6 |
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
@@ -22,14 +22,14 @@ def baike(cid, default_v=0):
|
|
22 |
def corpNorm(nm, add_region=True):
|
23 |
global CORP_TKS
|
24 |
if not nm or type(nm)!=type(""):return ""
|
25 |
-
nm =
|
26 |
nm = re.sub(r"&", "&", nm)
|
27 |
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
28 |
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
29 |
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
30 |
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
31 |
|
32 |
-
tks =
|
33 |
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
34 |
nm = ""
|
35 |
for t in tks:
|
|
|
1 |
import re,json,os
|
2 |
import pandas as pd
|
3 |
+
from rag.nlp import rag_tokenizer
|
4 |
from . import regions
|
5 |
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
6 |
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
|
|
22 |
def corpNorm(nm, add_region=True):
|
23 |
global CORP_TKS
|
24 |
if not nm or type(nm)!=type(""):return ""
|
25 |
+
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
|
26 |
nm = re.sub(r"&", "&", nm)
|
27 |
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
28 |
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
29 |
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
30 |
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
31 |
|
32 |
+
tks = rag_tokenizer.tokenize(nm).split(" ")
|
33 |
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
34 |
nm = ""
|
35 |
for t in tks:
|
deepdoc/parser/resume/step_two.py
CHANGED
@@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
|
|
3 |
traceback, signal
|
4 |
import numpy as np
|
5 |
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
6 |
-
from rag.nlp import
|
7 |
from xpinyin import Pinyin
|
8 |
from contextlib import contextmanager
|
9 |
|
@@ -83,7 +83,7 @@ def forEdu(cv):
|
|
83 |
if n.get("school_name") and isinstance(n["school_name"], str):
|
84 |
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
85 |
e["sch_nm_kwd"] = sch[-1]
|
86 |
-
fea.append(
|
87 |
|
88 |
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
89 |
maj.append(n["discipline_name"])
|
@@ -166,10 +166,10 @@ def forEdu(cv):
|
|
166 |
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
167 |
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
168 |
|
169 |
-
if cv.get("major_kwd"): cv["major_tks"] =
|
170 |
-
if cv.get("school_name_kwd"): cv["school_name_tks"] =
|
171 |
-
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] =
|
172 |
-
if cv.get("first_major_kwd"): cv["first_major_tks"] =
|
173 |
|
174 |
return cv
|
175 |
|
@@ -187,11 +187,11 @@ def forProj(cv):
|
|
187 |
if n.get("achivement"): desc.append(str(n["achivement"]))
|
188 |
|
189 |
if pro_nms:
|
190 |
-
# cv["pro_nms_tks"] =
|
191 |
-
cv["project_name_tks"] =
|
192 |
if desc:
|
193 |
-
cv["pro_desc_ltks"] =
|
194 |
-
cv["project_desc_ltks"] =
|
195 |
|
196 |
return cv
|
197 |
|
@@ -280,25 +280,25 @@ def forWork(cv):
|
|
280 |
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
281 |
|
282 |
if fea["position_name"]:
|
283 |
-
cv["position_name_tks"] =
|
284 |
-
cv["position_name_sm_tks"] =
|
285 |
-
cv["pos_nm_tks"] =
|
286 |
|
287 |
if fea["industry_name"]:
|
288 |
-
cv["industry_name_tks"] =
|
289 |
-
cv["industry_name_sm_tks"] =
|
290 |
-
cv["indu_nm_tks"] =
|
291 |
|
292 |
if fea["corporation_name"]:
|
293 |
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
294 |
cv["corp_nm_kwd"] = fea["corporation_name"]
|
295 |
-
cv["corporation_name_tks"] =
|
296 |
-
cv["corporation_name_sm_tks"] =
|
297 |
-
cv["corp_nm_tks"] =
|
298 |
|
299 |
if fea["responsibilities"]:
|
300 |
-
cv["responsibilities_ltks"] =
|
301 |
-
cv["resp_ltks"] =
|
302 |
|
303 |
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
304 |
re.match(r"[^0-9]+$", str(i))]
|
@@ -444,15 +444,15 @@ def parse(cv):
|
|
444 |
if nms:
|
445 |
t = k[:-4]
|
446 |
cv[f"{t}_kwd"] = nms
|
447 |
-
cv[f"{t}_tks"] =
|
448 |
except Exception as e:
|
449 |
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
450 |
cv[k] = []
|
451 |
|
452 |
# tokenize fields
|
453 |
if k in tks_fld:
|
454 |
-
cv[f"{k}_tks"] =
|
455 |
-
if k in small_tks_fld: cv[f"{k}_sm_tks"] =
|
456 |
|
457 |
# keyword fields
|
458 |
if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
|
@@ -492,7 +492,7 @@ def parse(cv):
|
|
492 |
cv["name_kwd"] = name
|
493 |
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
494 |
cv["name_tks"] = (
|
495 |
-
|
496 |
) if name else ""
|
497 |
else:
|
498 |
cv["integerity_flt"] /= 2.
|
@@ -515,7 +515,7 @@ def parse(cv):
|
|
515 |
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
516 |
# long text tokenize
|
517 |
|
518 |
-
if cv.get("responsibilities"): cv["responsibilities_ltks"] =
|
519 |
|
520 |
# for yes or no field
|
521 |
fea = []
|
|
|
3 |
traceback, signal
|
4 |
import numpy as np
|
5 |
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
6 |
+
from rag.nlp import rag_tokenizer, surname
|
7 |
from xpinyin import Pinyin
|
8 |
from contextlib import contextmanager
|
9 |
|
|
|
83 |
if n.get("school_name") and isinstance(n["school_name"], str):
|
84 |
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
85 |
e["sch_nm_kwd"] = sch[-1]
|
86 |
+
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
|
87 |
|
88 |
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
89 |
maj.append(n["discipline_name"])
|
|
|
166 |
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
167 |
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
168 |
|
169 |
+
if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
|
170 |
+
if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
|
171 |
+
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
|
172 |
+
if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
|
173 |
|
174 |
return cv
|
175 |
|
|
|
187 |
if n.get("achivement"): desc.append(str(n["achivement"]))
|
188 |
|
189 |
if pro_nms:
|
190 |
+
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
|
191 |
+
cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
|
192 |
if desc:
|
193 |
+
cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
|
194 |
+
cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
|
195 |
|
196 |
return cv
|
197 |
|
|
|
280 |
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
281 |
|
282 |
if fea["position_name"]:
|
283 |
+
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
|
284 |
+
cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
|
285 |
+
cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
|
286 |
|
287 |
if fea["industry_name"]:
|
288 |
+
cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
|
289 |
+
cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
|
290 |
+
cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
|
291 |
|
292 |
if fea["corporation_name"]:
|
293 |
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
294 |
cv["corp_nm_kwd"] = fea["corporation_name"]
|
295 |
+
cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
|
296 |
+
cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
|
297 |
+
cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
|
298 |
|
299 |
if fea["responsibilities"]:
|
300 |
+
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
|
301 |
+
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
|
302 |
|
303 |
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
304 |
re.match(r"[^0-9]+$", str(i))]
|
|
|
444 |
if nms:
|
445 |
t = k[:-4]
|
446 |
cv[f"{t}_kwd"] = nms
|
447 |
+
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
|
448 |
except Exception as e:
|
449 |
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
450 |
cv[k] = []
|
451 |
|
452 |
# tokenize fields
|
453 |
if k in tks_fld:
|
454 |
+
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
|
455 |
+
if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
|
456 |
|
457 |
# keyword fields
|
458 |
if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
|
|
|
492 |
cv["name_kwd"] = name
|
493 |
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
494 |
cv["name_tks"] = (
|
495 |
+
rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
496 |
) if name else ""
|
497 |
else:
|
498 |
cv["integerity_flt"] /= 2.
|
|
|
515 |
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
516 |
# long text tokenize
|
517 |
|
518 |
+
if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
|
519 |
|
520 |
# for yes or no field
|
521 |
fea = []
|
deepdoc/vision/table_structure_recognizer.py
CHANGED
@@ -19,7 +19,7 @@ import numpy as np
|
|
19 |
from huggingface_hub import snapshot_download
|
20 |
|
21 |
from api.utils.file_utils import get_project_base_directory
|
22 |
-
from rag.nlp import
|
23 |
from .recognizer import Recognizer
|
24 |
|
25 |
|
@@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
|
|
117 |
for p, n in patt:
|
118 |
if re.search(p, b["text"].strip()):
|
119 |
return n
|
120 |
-
tks = [t for t in
|
121 |
if len(tks) > 3:
|
122 |
if len(tks) < 12:
|
123 |
return "Tx"
|
124 |
else:
|
125 |
return "Lx"
|
126 |
|
127 |
-
if len(tks) == 1 and
|
128 |
return "Nr"
|
129 |
|
130 |
return "Ot"
|
|
|
19 |
from huggingface_hub import snapshot_download
|
20 |
|
21 |
from api.utils.file_utils import get_project_base_directory
|
22 |
+
from rag.nlp import rag_tokenizer
|
23 |
from .recognizer import Recognizer
|
24 |
|
25 |
|
|
|
117 |
for p, n in patt:
|
118 |
if re.search(p, b["text"].strip()):
|
119 |
return n
|
120 |
+
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
|
121 |
if len(tks) > 3:
|
122 |
if len(tks) < 12:
|
123 |
return "Tx"
|
124 |
else:
|
125 |
return "Lx"
|
126 |
|
127 |
+
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
128 |
return "Nr"
|
129 |
|
130 |
return "Ot"
|
rag/app/book.py
CHANGED
@@ -18,7 +18,7 @@ from io import BytesIO
|
|
18 |
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
19 |
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
|
20 |
tokenize_chunks, find_codec
|
21 |
-
from rag.nlp import
|
22 |
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
23 |
|
24 |
|
@@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
63 |
"""
|
64 |
doc = {
|
65 |
"docnm_kwd": filename,
|
66 |
-
"title_tks":
|
67 |
}
|
68 |
-
doc["title_sm_tks"] =
|
69 |
pdf_parser = None
|
70 |
sections, tbls = [], []
|
71 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
|
|
18 |
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
19 |
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
|
20 |
tokenize_chunks, find_codec
|
21 |
+
from rag.nlp import rag_tokenizer
|
22 |
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
23 |
|
24 |
|
|
|
63 |
"""
|
64 |
doc = {
|
65 |
"docnm_kwd": filename,
|
66 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
67 |
}
|
68 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
69 |
pdf_parser = None
|
70 |
sections, tbls = [], []
|
71 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
rag/app/laws.py
CHANGED
@@ -19,7 +19,7 @@ from docx import Document
|
|
19 |
from api.db import ParserType
|
20 |
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
21 |
make_colon_as_title, add_positions, tokenize_chunks, find_codec
|
22 |
-
from rag.nlp import
|
23 |
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
24 |
from rag.settings import cron_logger
|
25 |
|
@@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
89 |
"""
|
90 |
doc = {
|
91 |
"docnm_kwd": filename,
|
92 |
-
"title_tks":
|
93 |
}
|
94 |
-
doc["title_sm_tks"] =
|
95 |
pdf_parser = None
|
96 |
sections = []
|
97 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
|
|
19 |
from api.db import ParserType
|
20 |
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
21 |
make_colon_as_title, add_positions, tokenize_chunks, find_codec
|
22 |
+
from rag.nlp import rag_tokenizer
|
23 |
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
24 |
from rag.settings import cron_logger
|
25 |
|
|
|
89 |
"""
|
90 |
doc = {
|
91 |
"docnm_kwd": filename,
|
92 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
93 |
}
|
94 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
95 |
pdf_parser = None
|
96 |
sections = []
|
97 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
rag/app/manual.py
CHANGED
@@ -2,7 +2,7 @@ import copy
|
|
2 |
import re
|
3 |
|
4 |
from api.db import ParserType
|
5 |
-
from rag.nlp import
|
6 |
from deepdoc.parser import PdfParser, PlainParser
|
7 |
from rag.utils import num_tokens_from_string
|
8 |
|
@@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
70 |
doc = {
|
71 |
"docnm_kwd": filename
|
72 |
}
|
73 |
-
doc["title_tks"] =
|
74 |
-
doc["title_sm_tks"] =
|
75 |
# is it English
|
76 |
eng = lang.lower() == "english" # pdf_parser.is_english
|
77 |
|
|
|
2 |
import re
|
3 |
|
4 |
from api.db import ParserType
|
5 |
+
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
6 |
from deepdoc.parser import PdfParser, PlainParser
|
7 |
from rag.utils import num_tokens_from_string
|
8 |
|
|
|
70 |
doc = {
|
71 |
"docnm_kwd": filename
|
72 |
}
|
73 |
+
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
74 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
75 |
# is it English
|
76 |
eng = lang.lower() == "english" # pdf_parser.is_english
|
77 |
|
rag/app/naive.py
CHANGED
@@ -16,7 +16,7 @@ from docx import Document
|
|
16 |
from timeit import default_timer as timer
|
17 |
import re
|
18 |
from deepdoc.parser.pdf_parser import PlainParser
|
19 |
-
from rag.nlp import
|
20 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
21 |
from rag.settings import cron_logger
|
22 |
|
@@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
112 |
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
113 |
doc = {
|
114 |
"docnm_kwd": filename,
|
115 |
-
"title_tks":
|
116 |
}
|
117 |
-
doc["title_sm_tks"] =
|
118 |
res = []
|
119 |
pdf_parser = None
|
120 |
sections = []
|
|
|
16 |
from timeit import default_timer as timer
|
17 |
import re
|
18 |
from deepdoc.parser.pdf_parser import PlainParser
|
19 |
+
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
20 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
21 |
from rag.settings import cron_logger
|
22 |
|
|
|
112 |
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
113 |
doc = {
|
114 |
"docnm_kwd": filename,
|
115 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
116 |
}
|
117 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
118 |
res = []
|
119 |
pdf_parser = None
|
120 |
sections = []
|
rag/app/one.py
CHANGED
@@ -14,7 +14,7 @@ from tika import parser
|
|
14 |
from io import BytesIO
|
15 |
import re
|
16 |
from rag.app import laws
|
17 |
-
from rag.nlp import
|
18 |
from deepdoc.parser import PdfParser, ExcelParser, PlainParser
|
19 |
|
20 |
|
@@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
111 |
|
112 |
doc = {
|
113 |
"docnm_kwd": filename,
|
114 |
-
"title_tks":
|
115 |
}
|
116 |
-
doc["title_sm_tks"] =
|
117 |
tokenize(doc, "\n".join(sections), eng)
|
118 |
return [doc]
|
119 |
|
|
|
14 |
from io import BytesIO
|
15 |
import re
|
16 |
from rag.app import laws
|
17 |
+
from rag.nlp import rag_tokenizer, tokenize, find_codec
|
18 |
from deepdoc.parser import PdfParser, ExcelParser, PlainParser
|
19 |
|
20 |
|
|
|
111 |
|
112 |
doc = {
|
113 |
"docnm_kwd": filename,
|
114 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
115 |
}
|
116 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
117 |
tokenize(doc, "\n".join(sections), eng)
|
118 |
return [doc]
|
119 |
|
rag/app/paper.py
CHANGED
@@ -15,7 +15,7 @@ import re
|
|
15 |
from collections import Counter
|
16 |
|
17 |
from api.db import ParserType
|
18 |
-
from rag.nlp import
|
19 |
from deepdoc.parser import PdfParser, PlainParser
|
20 |
import numpy as np
|
21 |
from rag.utils import num_tokens_from_string
|
@@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
153 |
else:
|
154 |
raise NotImplementedError("file type not supported yet(pdf supported)")
|
155 |
|
156 |
-
doc = {"docnm_kwd": filename, "authors_tks":
|
157 |
-
"title_tks":
|
158 |
-
doc["title_sm_tks"] =
|
159 |
-
doc["authors_sm_tks"] =
|
160 |
# is it English
|
161 |
eng = lang.lower() == "english" # pdf_parser.is_english
|
162 |
print("It's English.....", eng)
|
|
|
15 |
from collections import Counter
|
16 |
|
17 |
from api.db import ParserType
|
18 |
+
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
19 |
from deepdoc.parser import PdfParser, PlainParser
|
20 |
import numpy as np
|
21 |
from rag.utils import num_tokens_from_string
|
|
|
153 |
else:
|
154 |
raise NotImplementedError("file type not supported yet(pdf supported)")
|
155 |
|
156 |
+
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
|
157 |
+
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
|
158 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
159 |
+
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
|
160 |
# is it English
|
161 |
eng = lang.lower() == "english" # pdf_parser.is_english
|
162 |
print("It's English.....", eng)
|
rag/app/presentation.py
CHANGED
@@ -17,7 +17,7 @@ from io import BytesIO
|
|
17 |
from PIL import Image
|
18 |
|
19 |
from rag.nlp import tokenize, is_english
|
20 |
-
from rag.nlp import
|
21 |
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
22 |
from PyPDF2 import PdfReader as pdf2_read
|
23 |
|
@@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
96 |
eng = lang.lower() == "english"
|
97 |
doc = {
|
98 |
"docnm_kwd": filename,
|
99 |
-
"title_tks":
|
100 |
}
|
101 |
-
doc["title_sm_tks"] =
|
102 |
res = []
|
103 |
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
104 |
ppt_parser = Ppt()
|
|
|
17 |
from PIL import Image
|
18 |
|
19 |
from rag.nlp import tokenize, is_english
|
20 |
+
from rag.nlp import rag_tokenizer
|
21 |
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
22 |
from PyPDF2 import PdfReader as pdf2_read
|
23 |
|
|
|
96 |
eng = lang.lower() == "english"
|
97 |
doc = {
|
98 |
"docnm_kwd": filename,
|
99 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
100 |
}
|
101 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
102 |
res = []
|
103 |
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
104 |
ppt_parser = Ppt()
|
rag/app/qa.py
CHANGED
@@ -16,7 +16,7 @@ from io import BytesIO
|
|
16 |
from nltk import word_tokenize
|
17 |
from openpyxl import load_workbook
|
18 |
from rag.nlp import is_english, random_choices, find_codec
|
19 |
-
from rag.nlp import
|
20 |
from deepdoc.parser import ExcelParser
|
21 |
|
22 |
|
@@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
|
|
73 |
aprefix = "Answer: " if eng else "回答:"
|
74 |
d["content_with_weight"] = "\t".join(
|
75 |
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
76 |
-
d["content_ltks"] =
|
77 |
-
d["content_sm_ltks"] =
|
78 |
return d
|
79 |
|
80 |
|
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|
94 |
res = []
|
95 |
doc = {
|
96 |
"docnm_kwd": filename,
|
97 |
-
"title_tks":
|
98 |
}
|
99 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
100 |
callback(0.1, "Start to parse.")
|
|
|
16 |
from nltk import word_tokenize
|
17 |
from openpyxl import load_workbook
|
18 |
from rag.nlp import is_english, random_choices, find_codec
|
19 |
+
from rag.nlp import rag_tokenizer
|
20 |
from deepdoc.parser import ExcelParser
|
21 |
|
22 |
|
|
|
73 |
aprefix = "Answer: " if eng else "回答:"
|
74 |
d["content_with_weight"] = "\t".join(
|
75 |
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
76 |
+
d["content_ltks"] = rag_tokenizer.tokenize(q)
|
77 |
+
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
78 |
return d
|
79 |
|
80 |
|
|
|
94 |
res = []
|
95 |
doc = {
|
96 |
"docnm_kwd": filename,
|
97 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
98 |
}
|
99 |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
100 |
callback(0.1, "Start to parse.")
|
rag/app/resume.py
CHANGED
@@ -18,7 +18,7 @@ import re
|
|
18 |
import pandas as pd
|
19 |
import requests
|
20 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
21 |
-
from rag.nlp import
|
22 |
from deepdoc.parser.resume import refactor
|
23 |
from deepdoc.parser.resume import step_one, step_two
|
24 |
from rag.settings import cron_logger
|
@@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|
131 |
titles.append(str(v))
|
132 |
doc = {
|
133 |
"docnm_kwd": filename,
|
134 |
-
"title_tks":
|
135 |
}
|
136 |
-
doc["title_sm_tks"] =
|
137 |
pairs = []
|
138 |
for n, m in field_map.items():
|
139 |
if not resume.get(n):
|
@@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|
147 |
|
148 |
doc["content_with_weight"] = "\n".join(
|
149 |
["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
|
150 |
-
doc["content_ltks"] =
|
151 |
-
doc["content_sm_ltks"] =
|
152 |
for n, _ in field_map.items():
|
153 |
if n not in resume:
|
154 |
continue
|
@@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|
156 |
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
157 |
resume[n] = resume[n][0]
|
158 |
if n.find("_tks") > 0:
|
159 |
-
resume[n] =
|
160 |
doc[n] = resume[n]
|
161 |
|
162 |
print(doc)
|
|
|
18 |
import pandas as pd
|
19 |
import requests
|
20 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
21 |
+
from rag.nlp import rag_tokenizer
|
22 |
from deepdoc.parser.resume import refactor
|
23 |
from deepdoc.parser.resume import step_one, step_two
|
24 |
from rag.settings import cron_logger
|
|
|
131 |
titles.append(str(v))
|
132 |
doc = {
|
133 |
"docnm_kwd": filename,
|
134 |
+
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
|
135 |
}
|
136 |
+
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
137 |
pairs = []
|
138 |
for n, m in field_map.items():
|
139 |
if not resume.get(n):
|
|
|
147 |
|
148 |
doc["content_with_weight"] = "\n".join(
|
149 |
["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
|
150 |
+
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
|
151 |
+
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
|
152 |
for n, _ in field_map.items():
|
153 |
if n not in resume:
|
154 |
continue
|
|
|
156 |
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
157 |
resume[n] = resume[n][0]
|
158 |
if n.find("_tks") > 0:
|
159 |
+
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
|
160 |
doc[n] = resume[n]
|
161 |
|
162 |
print(doc)
|
rag/app/table.py
CHANGED
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
|
|
20 |
from dateutil.parser import parse as datetime_parse
|
21 |
|
22 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
23 |
-
from rag.nlp import
|
24 |
from deepdoc.parser import ExcelParser
|
25 |
|
26 |
|
@@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|
216 |
for ii, row in df.iterrows():
|
217 |
d = {
|
218 |
"docnm_kwd": filename,
|
219 |
-
"title_tks":
|
220 |
}
|
221 |
row_txt = []
|
222 |
for j in range(len(clmns)):
|
@@ -227,7 +227,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|
227 |
if pd.isna(row[clmns[j]]):
|
228 |
continue
|
229 |
fld = clmns_map[j][0]
|
230 |
-
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else
|
231 |
row[clmns[j]])
|
232 |
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
|
233 |
if not row_txt:
|
|
|
20 |
from dateutil.parser import parse as datetime_parse
|
21 |
|
22 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
23 |
+
from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
|
24 |
from deepdoc.parser import ExcelParser
|
25 |
|
26 |
|
|
|
216 |
for ii, row in df.iterrows():
|
217 |
d = {
|
218 |
"docnm_kwd": filename,
|
219 |
+
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
220 |
}
|
221 |
row_txt = []
|
222 |
for j in range(len(clmns)):
|
|
|
227 |
if pd.isna(row[clmns[j]]):
|
228 |
continue
|
229 |
fld = clmns_map[j][0]
|
230 |
+
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
|
231 |
row[clmns[j]])
|
232 |
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
|
233 |
if not row_txt:
|
rag/nlp/__init__.py
CHANGED
@@ -2,7 +2,7 @@ import random
|
|
2 |
from collections import Counter
|
3 |
|
4 |
from rag.utils import num_tokens_from_string
|
5 |
-
from . import
|
6 |
import re
|
7 |
import copy
|
8 |
|
@@ -109,8 +109,8 @@ def is_english(texts):
|
|
109 |
def tokenize(d, t, eng):
|
110 |
d["content_with_weight"] = t
|
111 |
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
112 |
-
d["content_ltks"] =
|
113 |
-
d["content_sm_ltks"] =
|
114 |
|
115 |
|
116 |
def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
|
|
2 |
from collections import Counter
|
3 |
|
4 |
from rag.utils import num_tokens_from_string
|
5 |
+
from . import rag_tokenizer
|
6 |
import re
|
7 |
import copy
|
8 |
|
|
|
109 |
def tokenize(d, t, eng):
|
110 |
d["content_with_weight"] = t
|
111 |
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
112 |
+
d["content_ltks"] = rag_tokenizer.tokenize(t)
|
113 |
+
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
114 |
|
115 |
|
116 |
def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
rag/nlp/query.py
CHANGED
@@ -7,7 +7,7 @@ import logging
|
|
7 |
import copy
|
8 |
from elasticsearch_dsl import Q
|
9 |
|
10 |
-
from rag.nlp import
|
11 |
|
12 |
|
13 |
class EsQueryer:
|
@@ -47,13 +47,13 @@ class EsQueryer:
|
|
47 |
txt = re.sub(
|
48 |
r"[ \r\n\t,,。??/`!!&]+",
|
49 |
" ",
|
50 |
-
|
51 |
-
|
52 |
txt.lower()))).strip()
|
53 |
txt = EsQueryer.rmWWW(txt)
|
54 |
|
55 |
if not self.isChinese(txt):
|
56 |
-
tks =
|
57 |
q = copy.deepcopy(tks)
|
58 |
for i in range(1, len(tks)):
|
59 |
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
|
@@ -65,7 +65,7 @@ class EsQueryer:
|
|
65 |
boost=1)#, minimum_should_match=min_match)
|
66 |
), tks
|
67 |
|
68 |
-
def
|
69 |
if len(tk) < 4:
|
70 |
return False
|
71 |
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
|
@@ -81,7 +81,7 @@ class EsQueryer:
|
|
81 |
logging.info(json.dumps(twts, ensure_ascii=False))
|
82 |
tms = []
|
83 |
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
84 |
-
sm =
|
85 |
sm = [
|
86 |
re.sub(
|
87 |
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
|
@@ -110,10 +110,10 @@ class EsQueryer:
|
|
110 |
if len(twts) > 1:
|
111 |
tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
|
112 |
if re.match(r"[0-9a-z ]+$", tt):
|
113 |
-
tms = f"(\"{tt}\" OR \"%s\")" %
|
114 |
|
115 |
syns = " OR ".join(
|
116 |
-
["\"%s\"^0.7" % EsQueryer.subSpecialChar(
|
117 |
if syns:
|
118 |
tms = f"({tms})^5 OR ({syns})^0.7"
|
119 |
|
|
|
7 |
import copy
|
8 |
from elasticsearch_dsl import Q
|
9 |
|
10 |
+
from rag.nlp import rag_tokenizer, term_weight, synonym
|
11 |
|
12 |
|
13 |
class EsQueryer:
|
|
|
47 |
txt = re.sub(
|
48 |
r"[ \r\n\t,,。??/`!!&]+",
|
49 |
" ",
|
50 |
+
rag_tokenizer.tradi2simp(
|
51 |
+
rag_tokenizer.strQ2B(
|
52 |
txt.lower()))).strip()
|
53 |
txt = EsQueryer.rmWWW(txt)
|
54 |
|
55 |
if not self.isChinese(txt):
|
56 |
+
tks = rag_tokenizer.tokenize(txt).split(" ")
|
57 |
q = copy.deepcopy(tks)
|
58 |
for i in range(1, len(tks)):
|
59 |
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
|
|
|
65 |
boost=1)#, minimum_should_match=min_match)
|
66 |
), tks
|
67 |
|
68 |
+
def need_fine_grained_tokenize(tk):
|
69 |
if len(tk) < 4:
|
70 |
return False
|
71 |
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
|
|
|
81 |
logging.info(json.dumps(twts, ensure_ascii=False))
|
82 |
tms = []
|
83 |
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
84 |
+
sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
|
85 |
sm = [
|
86 |
re.sub(
|
87 |
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
|
|
|
110 |
if len(twts) > 1:
|
111 |
tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
|
112 |
if re.match(r"[0-9a-z ]+$", tt):
|
113 |
+
tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)
|
114 |
|
115 |
syns = " OR ".join(
|
116 |
+
["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
|
117 |
if syns:
|
118 |
tms = f"({tms})^5 OR ({syns})^0.7"
|
119 |
|
rag/nlp/rag_tokenizer.py
ADDED
@@ -0,0 +1,423 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import copy
|
4 |
+
import datrie
|
5 |
+
import math
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import string
|
9 |
+
import sys
|
10 |
+
from hanziconv import HanziConv
|
11 |
+
from huggingface_hub import snapshot_download
|
12 |
+
from nltk import word_tokenize
|
13 |
+
from nltk.stem import PorterStemmer, WordNetLemmatizer
|
14 |
+
from api.utils.file_utils import get_project_base_directory
|
15 |
+
|
16 |
+
|
17 |
+
class RagTokenizer:
|
18 |
+
def key_(self, line):
|
19 |
+
return str(line.lower().encode("utf-8"))[2:-1]
|
20 |
+
|
21 |
+
def rkey_(self, line):
|
22 |
+
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
|
23 |
+
|
24 |
+
def loadDict_(self, fnm):
|
25 |
+
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
|
26 |
+
try:
|
27 |
+
of = open(fnm, "r")
|
28 |
+
while True:
|
29 |
+
line = of.readline()
|
30 |
+
if not line:
|
31 |
+
break
|
32 |
+
line = re.sub(r"[\r\n]+", "", line)
|
33 |
+
line = re.split(r"[ \t]", line)
|
34 |
+
k = self.key_(line[0])
|
35 |
+
F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
|
36 |
+
if k not in self.trie_ or self.trie_[k][0] < F:
|
37 |
+
self.trie_[self.key_(line[0])] = (F, line[2])
|
38 |
+
self.trie_[self.rkey_(line[0])] = 1
|
39 |
+
self.trie_.save(fnm + ".trie")
|
40 |
+
of.close()
|
41 |
+
except Exception as e:
|
42 |
+
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
|
43 |
+
|
44 |
+
def __init__(self, debug=False):
|
45 |
+
self.DEBUG = debug
|
46 |
+
self.DENOMINATOR = 1000000
|
47 |
+
self.trie_ = datrie.Trie(string.printable)
|
48 |
+
self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
|
49 |
+
|
50 |
+
self.stemmer = PorterStemmer()
|
51 |
+
self.lemmatizer = WordNetLemmatizer()
|
52 |
+
|
53 |
+
self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
|
54 |
+
try:
|
55 |
+
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
56 |
+
return
|
57 |
+
except Exception as e:
|
58 |
+
print("[HUQIE]:Build default trie", file=sys.stderr)
|
59 |
+
self.trie_ = datrie.Trie(string.printable)
|
60 |
+
|
61 |
+
self.loadDict_(self.DIR_ + ".txt")
|
62 |
+
|
63 |
+
def loadUserDict(self, fnm):
|
64 |
+
try:
|
65 |
+
self.trie_ = datrie.Trie.load(fnm + ".trie")
|
66 |
+
return
|
67 |
+
except Exception as e:
|
68 |
+
self.trie_ = datrie.Trie(string.printable)
|
69 |
+
self.loadDict_(fnm)
|
70 |
+
|
71 |
+
def addUserDict(self, fnm):
|
72 |
+
self.loadDict_(fnm)
|
73 |
+
|
74 |
+
def _strQ2B(self, ustring):
|
75 |
+
"""把字符串全角转半角"""
|
76 |
+
rstring = ""
|
77 |
+
for uchar in ustring:
|
78 |
+
inside_code = ord(uchar)
|
79 |
+
if inside_code == 0x3000:
|
80 |
+
inside_code = 0x0020
|
81 |
+
else:
|
82 |
+
inside_code -= 0xfee0
|
83 |
+
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
|
84 |
+
rstring += uchar
|
85 |
+
else:
|
86 |
+
rstring += chr(inside_code)
|
87 |
+
return rstring
|
88 |
+
|
89 |
+
def _tradi2simp(self, line):
|
90 |
+
return HanziConv.toSimplified(line)
|
91 |
+
|
92 |
+
def dfs_(self, chars, s, preTks, tkslist):
|
93 |
+
MAX_L = 10
|
94 |
+
res = s
|
95 |
+
# if s > MAX_L or s>= len(chars):
|
96 |
+
if s >= len(chars):
|
97 |
+
tkslist.append(preTks)
|
98 |
+
return res
|
99 |
+
|
100 |
+
# pruning
|
101 |
+
S = s + 1
|
102 |
+
if s + 2 <= len(chars):
|
103 |
+
t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
|
104 |
+
if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
|
105 |
+
self.key_(t2)):
|
106 |
+
S = s + 2
|
107 |
+
if len(preTks) > 2 and len(
|
108 |
+
preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
|
109 |
+
t1 = preTks[-1][0] + "".join(chars[s:s + 1])
|
110 |
+
if self.trie_.has_keys_with_prefix(self.key_(t1)):
|
111 |
+
S = s + 2
|
112 |
+
|
113 |
+
################
|
114 |
+
for e in range(S, len(chars) + 1):
|
115 |
+
t = "".join(chars[s:e])
|
116 |
+
k = self.key_(t)
|
117 |
+
|
118 |
+
if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
|
119 |
+
break
|
120 |
+
|
121 |
+
if k in self.trie_:
|
122 |
+
pretks = copy.deepcopy(preTks)
|
123 |
+
if k in self.trie_:
|
124 |
+
pretks.append((t, self.trie_[k]))
|
125 |
+
else:
|
126 |
+
pretks.append((t, (-12, '')))
|
127 |
+
res = max(res, self.dfs_(chars, e, pretks, tkslist))
|
128 |
+
|
129 |
+
if res > s:
|
130 |
+
return res
|
131 |
+
|
132 |
+
t = "".join(chars[s:s + 1])
|
133 |
+
k = self.key_(t)
|
134 |
+
if k in self.trie_:
|
135 |
+
preTks.append((t, self.trie_[k]))
|
136 |
+
else:
|
137 |
+
preTks.append((t, (-12, '')))
|
138 |
+
|
139 |
+
return self.dfs_(chars, s + 1, preTks, tkslist)
|
140 |
+
|
141 |
+
def freq(self, tk):
|
142 |
+
k = self.key_(tk)
|
143 |
+
if k not in self.trie_:
|
144 |
+
return 0
|
145 |
+
return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)
|
146 |
+
|
147 |
+
def tag(self, tk):
|
148 |
+
k = self.key_(tk)
|
149 |
+
if k not in self.trie_:
|
150 |
+
return ""
|
151 |
+
return self.trie_[k][1]
|
152 |
+
|
153 |
+
def score_(self, tfts):
|
154 |
+
B = 30
|
155 |
+
F, L, tks = 0, 0, []
|
156 |
+
for tk, (freq, tag) in tfts:
|
157 |
+
F += freq
|
158 |
+
L += 0 if len(tk) < 2 else 1
|
159 |
+
tks.append(tk)
|
160 |
+
F /= len(tks)
|
161 |
+
L /= len(tks)
|
162 |
+
if self.DEBUG:
|
163 |
+
print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
|
164 |
+
return tks, B / len(tks) + L + F
|
165 |
+
|
166 |
+
def sortTks_(self, tkslist):
|
167 |
+
res = []
|
168 |
+
for tfts in tkslist:
|
169 |
+
tks, s = self.score_(tfts)
|
170 |
+
res.append((tks, s))
|
171 |
+
return sorted(res, key=lambda x: x[1], reverse=True)
|
172 |
+
|
173 |
+
def merge_(self, tks):
|
174 |
+
patts = [
|
175 |
+
(r"[ ]+", " "),
|
176 |
+
(r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
|
177 |
+
]
|
178 |
+
# for p,s in patts: tks = re.sub(p, s, tks)
|
179 |
+
|
180 |
+
# if split chars is part of token
|
181 |
+
res = []
|
182 |
+
tks = re.sub(r"[ ]+", " ", tks).split(" ")
|
183 |
+
s = 0
|
184 |
+
while True:
|
185 |
+
if s >= len(tks):
|
186 |
+
break
|
187 |
+
E = s + 1
|
188 |
+
for e in range(s + 2, min(len(tks) + 2, s + 6)):
|
189 |
+
tk = "".join(tks[s:e])
|
190 |
+
if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
|
191 |
+
E = e
|
192 |
+
res.append("".join(tks[s:E]))
|
193 |
+
s = E
|
194 |
+
|
195 |
+
return " ".join(res)
|
196 |
+
|
197 |
+
def maxForward_(self, line):
|
198 |
+
res = []
|
199 |
+
s = 0
|
200 |
+
while s < len(line):
|
201 |
+
e = s + 1
|
202 |
+
t = line[s:e]
|
203 |
+
while e < len(line) and self.trie_.has_keys_with_prefix(
|
204 |
+
self.key_(t)):
|
205 |
+
e += 1
|
206 |
+
t = line[s:e]
|
207 |
+
|
208 |
+
while e - 1 > s and self.key_(t) not in self.trie_:
|
209 |
+
e -= 1
|
210 |
+
t = line[s:e]
|
211 |
+
|
212 |
+
if self.key_(t) in self.trie_:
|
213 |
+
res.append((t, self.trie_[self.key_(t)]))
|
214 |
+
else:
|
215 |
+
res.append((t, (0, '')))
|
216 |
+
|
217 |
+
s = e
|
218 |
+
|
219 |
+
return self.score_(res)
|
220 |
+
|
221 |
+
def maxBackward_(self, line):
|
222 |
+
res = []
|
223 |
+
s = len(line) - 1
|
224 |
+
while s >= 0:
|
225 |
+
e = s + 1
|
226 |
+
t = line[s:e]
|
227 |
+
while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
|
228 |
+
s -= 1
|
229 |
+
t = line[s:e]
|
230 |
+
|
231 |
+
while s + 1 < e and self.key_(t) not in self.trie_:
|
232 |
+
s += 1
|
233 |
+
t = line[s:e]
|
234 |
+
|
235 |
+
if self.key_(t) in self.trie_:
|
236 |
+
res.append((t, self.trie_[self.key_(t)]))
|
237 |
+
else:
|
238 |
+
res.append((t, (0, '')))
|
239 |
+
|
240 |
+
s -= 1
|
241 |
+
|
242 |
+
return self.score_(res[::-1])
|
243 |
+
|
244 |
+
def tokenize(self, line):
|
245 |
+
line = self._strQ2B(line).lower()
|
246 |
+
line = self._tradi2simp(line)
|
247 |
+
zh_num = len([1 for c in line if is_chinese(c)])
|
248 |
+
if zh_num < len(line) * 0.2:
|
249 |
+
return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])
|
250 |
+
|
251 |
+
arr = re.split(self.SPLIT_CHAR, line)
|
252 |
+
res = []
|
253 |
+
for L in arr:
|
254 |
+
if len(L) < 2 or re.match(
|
255 |
+
r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
|
256 |
+
res.append(L)
|
257 |
+
continue
|
258 |
+
# print(L)
|
259 |
+
|
260 |
+
# use maxforward for the first time
|
261 |
+
tks, s = self.maxForward_(L)
|
262 |
+
tks1, s1 = self.maxBackward_(L)
|
263 |
+
if self.DEBUG:
|
264 |
+
print("[FW]", tks, s)
|
265 |
+
print("[BW]", tks1, s1)
|
266 |
+
|
267 |
+
diff = [0 for _ in range(max(len(tks1), len(tks)))]
|
268 |
+
for i in range(min(len(tks1), len(tks))):
|
269 |
+
if tks[i] != tks1[i]:
|
270 |
+
diff[i] = 1
|
271 |
+
|
272 |
+
if s1 > s:
|
273 |
+
tks = tks1
|
274 |
+
|
275 |
+
i = 0
|
276 |
+
while i < len(tks):
|
277 |
+
s = i
|
278 |
+
while s < len(tks) and diff[s] == 0:
|
279 |
+
s += 1
|
280 |
+
if s == len(tks):
|
281 |
+
res.append(" ".join(tks[i:]))
|
282 |
+
break
|
283 |
+
if s > i:
|
284 |
+
res.append(" ".join(tks[i:s]))
|
285 |
+
|
286 |
+
e = s
|
287 |
+
while e < len(tks) and e - s < 5 and diff[e] == 1:
|
288 |
+
e += 1
|
289 |
+
|
290 |
+
tkslist = []
|
291 |
+
self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
|
292 |
+
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
293 |
+
|
294 |
+
i = e + 1
|
295 |
+
|
296 |
+
res = " ".join(res)
|
297 |
+
if self.DEBUG:
|
298 |
+
print("[TKS]", self.merge_(res))
|
299 |
+
return self.merge_(res)
|
300 |
+
|
301 |
+
def fine_grained_tokenize(self, tks):
|
302 |
+
tks = tks.split(" ")
|
303 |
+
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
|
304 |
+
if zh_num < len(tks) * 0.2:
|
305 |
+
res = []
|
306 |
+
for tk in tks:
|
307 |
+
res.extend(tk.split("/"))
|
308 |
+
return " ".join(res)
|
309 |
+
|
310 |
+
res = []
|
311 |
+
for tk in tks:
|
312 |
+
if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
|
313 |
+
res.append(tk)
|
314 |
+
continue
|
315 |
+
tkslist = []
|
316 |
+
if len(tk) > 10:
|
317 |
+
tkslist.append(tk)
|
318 |
+
else:
|
319 |
+
self.dfs_(tk, 0, [], tkslist)
|
320 |
+
if len(tkslist) < 2:
|
321 |
+
res.append(tk)
|
322 |
+
continue
|
323 |
+
stk = self.sortTks_(tkslist)[1][0]
|
324 |
+
if len(stk) == len(tk):
|
325 |
+
stk = tk
|
326 |
+
else:
|
327 |
+
if re.match(r"[a-z\.-]+$", tk):
|
328 |
+
for t in stk:
|
329 |
+
if len(t) < 3:
|
330 |
+
stk = tk
|
331 |
+
break
|
332 |
+
else:
|
333 |
+
stk = " ".join(stk)
|
334 |
+
else:
|
335 |
+
stk = " ".join(stk)
|
336 |
+
|
337 |
+
res.append(stk)
|
338 |
+
|
339 |
+
return " ".join(res)
|
340 |
+
|
341 |
+
|
342 |
+
def is_chinese(s):
|
343 |
+
if s >= u'\u4e00' and s <= u'\u9fa5':
|
344 |
+
return True
|
345 |
+
else:
|
346 |
+
return False
|
347 |
+
|
348 |
+
|
349 |
+
def is_number(s):
|
350 |
+
if s >= u'\u0030' and s <= u'\u0039':
|
351 |
+
return True
|
352 |
+
else:
|
353 |
+
return False
|
354 |
+
|
355 |
+
|
356 |
+
def is_alphabet(s):
|
357 |
+
if (s >= u'\u0041' and s <= u'\u005a') or (
|
358 |
+
s >= u'\u0061' and s <= u'\u007a'):
|
359 |
+
return True
|
360 |
+
else:
|
361 |
+
return False
|
362 |
+
|
363 |
+
|
364 |
+
def naiveQie(txt):
|
365 |
+
tks = []
|
366 |
+
for t in txt.split(" "):
|
367 |
+
if tks and re.match(r".*[a-zA-Z]$", tks[-1]
|
368 |
+
) and re.match(r".*[a-zA-Z]$", t):
|
369 |
+
tks.append(" ")
|
370 |
+
tks.append(t)
|
371 |
+
return tks
|
372 |
+
|
373 |
+
|
374 |
+
tokenizer = RagTokenizer()
|
375 |
+
tokenize = tokenizer.tokenize
|
376 |
+
fine_grained_tokenize = tokenizer.fine_grained_tokenize
|
377 |
+
tag = tokenizer.tag
|
378 |
+
freq = tokenizer.freq
|
379 |
+
loadUserDict = tokenizer.loadUserDict
|
380 |
+
addUserDict = tokenizer.addUserDict
|
381 |
+
tradi2simp = tokenizer._tradi2simp
|
382 |
+
strQ2B = tokenizer._strQ2B
|
383 |
+
|
384 |
+
if __name__ == '__main__':
|
385 |
+
tknzr = RagTokenizer(debug=True)
|
386 |
+
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
|
387 |
+
tks = tknzr.tokenize(
|
388 |
+
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
|
389 |
+
print(tknzr.fine_grained_tokenize(tks))
|
390 |
+
tks = tknzr.tokenize(
|
391 |
+
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
|
392 |
+
print(tknzr.fine_grained_tokenize(tks))
|
393 |
+
tks = tknzr.tokenize(
|
394 |
+
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
|
395 |
+
print(tknzr.fine_grained_tokenize(tks))
|
396 |
+
tks = tknzr.tokenize(
|
397 |
+
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
|
398 |
+
print(tknzr.fine_grained_tokenize(tks))
|
399 |
+
tks = tknzr.tokenize("虽然我不怎么玩")
|
400 |
+
print(tknzr.fine_grained_tokenize(tks))
|
401 |
+
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
|
402 |
+
print(tknzr.fine_grained_tokenize(tks))
|
403 |
+
tks = tknzr.tokenize(
|
404 |
+
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
|
405 |
+
print(tknzr.fine_grained_tokenize(tks))
|
406 |
+
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
|
407 |
+
print(tknzr.fine_grained_tokenize(tks))
|
408 |
+
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
|
409 |
+
print(tknzr.fine_grained_tokenize(tks))
|
410 |
+
tks = tknzr.tokenize(
|
411 |
+
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
|
412 |
+
print(tknzr.fine_grained_tokenize(tks))
|
413 |
+
if len(sys.argv) < 2:
|
414 |
+
sys.exit()
|
415 |
+
tknzr.DEBUG = False
|
416 |
+
tknzr.loadUserDict(sys.argv[1])
|
417 |
+
of = open(sys.argv[2], "r")
|
418 |
+
while True:
|
419 |
+
line = of.readline()
|
420 |
+
if not line:
|
421 |
+
break
|
422 |
+
print(tknzr.tokenize(line))
|
423 |
+
of.close()
|
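rag_tokenizer.py instantiates one `RagTokenizer` at import time and re-exports its bound methods, so callers use module-level functions rather than the class. `loadUserDict`/`addUserDict` feed `loadDict_`, which expects one entry per line: term, frequency, and POS tag separated by spaces or tabs. A sketch of extending the dictionary (the path and entries are made up for illustration):

```python
from rag.nlp import rag_tokenizer

# Hypothetical user dictionary: loadDict_ parses "term frequency tag" per line,
# e.g. a file /tmp/user.dict.txt containing
#   学区房	1000000	n
#   蓝月亮	500000	n
rag_tokenizer.addUserDict("/tmp/user.dict.txt")

tks = rag_tokenizer.tokenize("多校划片就是一个小区对应多个小学初中")
print(tks)
print(rag_tokenizer.fine_grained_tokenize(tks))
```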
rag/nlp/search.py
CHANGED
@@ -9,7 +9,7 @@ from dataclasses import dataclass
 
 from rag.settings import es_logger
 from rag.utils import rmSpace
-from rag.nlp import
+from rag.nlp import rag_tokenizer, query
 import numpy as np
 
 
@@ -128,7 +128,7 @@ class Dealer:
         kwds = set([])
         for k in keywords:
             kwds.add(k)
-            for kk in
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -243,7 +243,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
-        chunks_tks = [
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -251,7 +251,7 @@ class Dealer:
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
-
+                                                            rag_tokenizer.tokenize(
                                                                 self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
@@ -310,8 +310,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-
-
+                                           rag_tokenizer.tokenize(ans).split(" "),
+                                           rag_tokenizer.tokenize(inst).split(" "))
 
     def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
@@ -385,7 +385,7 @@ class Dealer:
         for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
             fld, v = r.group(1), r.group(3)
             match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
-                fld,
+                fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
             replaces.append(
                 ("{}{}'{}'".format(
                     r.group(1),
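These hunks only change how the token arguments passed to hybrid_similarity() are produced; the scoring itself lives in rag/nlp/query.py and is not part of this diff. As an illustration only (not the query.py implementation), the call sites assume a blend of a token-overlap score and a cosine score weighted by tkweight/vtweight, roughly:

    # Hypothetical sketch of the blending the hybrid_similarity() call sites assume.
    # NOT the rag/nlp/query.py implementation; names and weighting are illustrative.
    import numpy as np

    def hybrid_similarity_sketch(ans_embd, chunk_embds, ans_tks, chunks_tks, tkweight, vtweight):
        ans_v = np.asarray(ans_embd, dtype=float)
        chunk_m = np.asarray(chunk_embds, dtype=float)
        # cosine similarity between the answer vector and every chunk vector
        vtsim = chunk_m @ ans_v / (np.linalg.norm(chunk_m, axis=1) * np.linalg.norm(ans_v) + 1e-9)
        # naive token-overlap ratio standing in for the real term-weighted score
        ans_set = set(ans_tks)
        tksim = np.array([len(ans_set & set(tks)) / (len(ans_set) or 1) for tks in chunks_tks])
        return tkweight * tksim + vtweight * vtsim, tksim, vtsim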
rag/nlp/term_weight.py
CHANGED
@@ -4,7 +4,7 @@ import json
 import re
 import os
 import numpy as np
-from rag.nlp import
+from rag.nlp import rag_tokenizer
 from api.utils.file_utils import get_project_base_directory
 
 
@@ -83,7 +83,7 @@ class Dealer:
         txt = re.sub(p, r, txt)
 
         res = []
-        for t in
+        for t in rag_tokenizer.tokenize(txt).split(" "):
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
@@ -161,7 +161,7 @@ class Dealer:
             return m[self.ne[t]]
 
         def postag(t):
-            t =
+            t = rag_tokenizer.tag(t)
            if t in set(["r", "c", "d"]):
                 return 0.3
             if t in set(["ns", "nt"]):
@@ -175,14 +175,14 @@ class Dealer:
         def freq(t):
             if re.match(r"[0-9. -]{2,}$", t):
                 return 3
-            s =
+            s = rag_tokenizer.freq(t)
             if not s and re.match(r"[a-z. -]+$", t):
                 return 300
             if not s:
                 s = 0
 
             if not s and len(t) >= 4:
-                s = [tt for tt in
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     s = np.min([freq(tt) for tt in s]) / 6.
                 else:
@@ -198,7 +198,7 @@ class Dealer:
             elif re.match(r"[a-z. -]+$", t):
                 return 300
             elif len(t) >= 4:
-                s = [tt for tt in
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     return max(3, np.min([df(tt) for tt in s]) / 6.)
 
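One practical effect of the freq() change above: a long term that is missing from the tokenizer's frequency dictionary inherits a damped frequency from its fine-grained sub-tokens instead of defaulting to zero. A simplified restatement as a standalone helper (hypothetical; it omits the numeric and pure-ASCII special cases handled in the real function):

    # Simplified, hypothetical restatement of the out-of-vocabulary fallback in freq().
    from rag.nlp import rag_tokenizer
    import numpy as np

    def freq_with_fallback(t):
        s = rag_tokenizer.freq(t) or 0
        if not s and len(t) >= 4:
            subs = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
            if len(subs) > 1:
                # rarest sub-token, damped by 6, mirroring the hunk above
                s = np.min([freq_with_fallback(tt) for tt in subs]) / 6.
        return int(s)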
rag/svr/cache_file_svr.py
CHANGED
@@ -4,13 +4,14 @@ import traceback
 
 from api.db.db_models import close_connection
 from api.db.services.task_service import TaskService
+from rag.settings import cron_logger
 from rag.utils.minio_conn import MINIO
 from rag.utils.redis_conn import REDIS_CONN
 
 
 def collect():
     doc_locations = TaskService.get_ongoing_doc_name()
-
+    print(doc_locations)
     if len(doc_locations) == 0:
         time.sleep(1)
         return
@@ -28,7 +29,7 @@ def main():
             if REDIS_CONN.exist(key):continue
             file_bin = MINIO.get(kb_id, loc)
             REDIS_CONN.transaction(key, file_bin, 12 * 60)
-
+            cron_logger.info("CACHE: {}".format(loc))
         except Exception as e:
             traceback.print_stack(e)
     except Exception as e:
rag/svr/task_broker.py
CHANGED
@@ -21,7 +21,6 @@ from datetime import datetime
 from api.db.db_models import Task
 from api.db.db_utils import bulk_insert_into_db
 from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService
 from deepdoc.parser import PdfParser
 from deepdoc.parser.excel_parser import RAGFlowExcelParser