Kevin Hu
committed on
Commit
·
58f507b
1
Parent(s):
49f80bf
Refine english synonym (#3371)
Browse files

### What problem does this PR solve?
#3361
### Type of change
- [x] Performance Improvement
- agent/component/base.py +14 -1
- agent/component/generate.py +12 -0
- api/apps/document_app.py +30 -0
- rag/nlp/query.py +12 -3
- rag/nlp/synonym.py +5 -1
agent/component/base.py
CHANGED
@@ -446,9 +446,22 @@ class ComponentBase(ABC):
|
|
446 |
outs = []
|
447 |
for q in self._param.query:
|
448 |
if q["component_id"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
|
450 |
self._param.inputs.append({"component_id": q["component_id"],
|
451 |
-
"content": "\n".join(
|
|
|
452 |
elif q["value"]:
|
453 |
self._param.inputs.append({"component_id": None, "content": q["value"]})
|
454 |
outs.append(pd.DataFrame([{"content": q["value"]}]))
|
|
|
446 |
outs = []
|
447 |
for q in self._param.query:
|
448 |
if q["component_id"]:
|
449 |
+
if q["component_id"].split("@")[0].lower().find("begin") > 0:
|
450 |
+
cpn_id, key = q["component_id"].split("@")
|
451 |
+
for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
|
452 |
+
if p["key"] == key:
|
453 |
+
outs.append(pd.DataFrame([{"content": p["value"]}]))
|
454 |
+
self._param.inputs.append({"component_id": q["component_id"],
|
455 |
+
"content": p["value"]})
|
456 |
+
break
|
457 |
+
else:
|
458 |
+
assert False, f"Can't find parameter '{key}' for {cpn_id}"
|
459 |
+
continue
|
460 |
+
|
461 |
outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
|
462 |
self._param.inputs.append({"component_id": q["component_id"],
|
463 |
+
"content": "\n".join(
|
464 |
+
[str(d["content"]) for d in outs[-1].to_dict('records')])})
|
465 |
elif q["value"]:
|
466 |
self._param.inputs.append({"component_id": None, "content": q["value"]})
|
467 |
outs.append(pd.DataFrame([{"content": q["value"]}]))
|
agent/component/generate.py
CHANGED
@@ -104,6 +104,18 @@ class Generate(ComponentBase):
|
|
104 |
retrieval_res = []
|
105 |
self._param.inputs = []
|
106 |
for para in self._param.parameters:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
cpn = self._canvas.get_component(para["component_id"])["obj"]
|
108 |
if cpn.component_name.lower() == "answer":
|
109 |
kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
|
|
|
104 |
retrieval_res = []
|
105 |
self._param.inputs = []
|
106 |
for para in self._param.parameters:
|
107 |
+
if para["component_id"].split("@")[0].lower().find("begin") > 0:
|
108 |
+
cpn_id, key = para["component_id"].split("@")
|
109 |
+
for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
|
110 |
+
if p["key"] == key:
|
111 |
+
kwargs[para["key"]] = p["value"]
|
112 |
+
self._param.inputs.append(
|
113 |
+
{"component_id": para["component_id"], "content": kwargs[para["key"]]})
|
114 |
+
break
|
115 |
+
else:
|
116 |
+
assert False, f"Can't find parameter '{key}' for {cpn_id}"
|
117 |
+
continue
|
118 |
+
|
119 |
cpn = self._canvas.get_component(para["component_id"])["obj"]
|
120 |
if cpn.component_name.lower() == "answer":
|
121 |
kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
|
api/apps/document_app.py
CHANGED
@@ -25,6 +25,7 @@ from api.db.services.file2document_service import File2DocumentService
|
|
25 |
from api.db.services.file_service import FileService
|
26 |
from api.db.services.task_service import TaskService, queue_tasks
|
27 |
from api.db.services.user_service import UserTenantService
|
|
|
28 |
from rag.nlp import search
|
29 |
from api.db.services import duplicate_name
|
30 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
@@ -518,3 +519,32 @@ def upload_and_parse():
|
|
518 |
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
|
519 |
|
520 |
return get_json_result(data=doc_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
from api.db.services.file_service import FileService
|
26 |
from api.db.services.task_service import TaskService, queue_tasks
|
27 |
from api.db.services.user_service import UserTenantService
|
28 |
+
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
29 |
from rag.nlp import search
|
30 |
from api.db.services import duplicate_name
|
31 |
from api.db.services.knowledgebase_service import KnowledgebaseService
|
|
|
519 |
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
|
520 |
|
521 |
return get_json_result(data=doc_ids)
|
522 |
+
|
523 |
+
|
524 |
+
@manager.route('/parse', methods=['POST'])
|
525 |
+
@login_required
|
526 |
+
def parse():
|
527 |
+
url = request.json.get("url")
|
528 |
+
if url:
|
529 |
+
if not is_valid_url(url):
|
530 |
+
return get_json_result(
|
531 |
+
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
532 |
+
from selenium.webdriver import Chrome, ChromeOptions
|
533 |
+
options = ChromeOptions()
|
534 |
+
options.add_argument('--headless')
|
535 |
+
options.add_argument('--disable-gpu')
|
536 |
+
options.add_argument('--no-sandbox')
|
537 |
+
options.add_argument('--disable-dev-shm-usage')
|
538 |
+
driver = Chrome(options=options)
|
539 |
+
driver.get(url)
|
540 |
+
sections = RAGFlowHtmlParser()(driver.page_source)
|
541 |
+
return get_json_result(data="\n".join(sections))
|
542 |
+
|
543 |
+
if 'file' not in request.files:
|
544 |
+
return get_json_result(
|
545 |
+
data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR)
|
546 |
+
|
547 |
+
file_objs = request.files.getlist('file')
|
548 |
+
txt = FileService.parse_docs(file_objs, current_user.id)
|
549 |
+
|
550 |
+
return get_json_result(data=txt)
|
rag/nlp/query.py
CHANGED
@@ -75,11 +75,20 @@ class FulltextQueryer:
|
|
75 |
if not self.isChinese(txt):
|
76 |
txt = FulltextQueryer.rmWWW(txt)
|
77 |
tks = rag_tokenizer.tokenize(txt).split(" ")
|
78 |
-
|
|
|
79 |
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
|
80 |
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
|
81 |
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
for i in range(1, len(tks_w)):
|
84 |
q.append(
|
85 |
'"%s %s"^%.4f'
|
@@ -94,7 +103,7 @@ class FulltextQueryer:
|
|
94 |
query = " ".join(q)
|
95 |
return MatchTextExpr(
|
96 |
self.query_fields, query, 100
|
97 |
-
),
|
98 |
|
99 |
def need_fine_grained_tokenize(tk):
|
100 |
if len(tk) < 3:
|
|
|
75 |
if not self.isChinese(txt):
|
76 |
txt = FulltextQueryer.rmWWW(txt)
|
77 |
tks = rag_tokenizer.tokenize(txt).split(" ")
|
78 |
+
keywords = [t for t in tks if t]
|
79 |
+
tks_w = self.tw.weights(tks, preprocess=False)
|
80 |
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
|
81 |
tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
|
82 |
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
|
83 |
+
syns = []
|
84 |
+
for tk, w in tks_w:
|
85 |
+
syn = self.syn.lookup(tk)
|
86 |
+
syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
|
87 |
+
keywords.extend(syn)
|
88 |
+
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
|
89 |
+
syns.append(" ".join(syn))
|
90 |
+
|
91 |
+
q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
|
92 |
for i in range(1, len(tks_w)):
|
93 |
q.append(
|
94 |
'"%s %s"^%.4f'
|
|
|
103 |
query = " ".join(q)
|
104 |
return MatchTextExpr(
|
105 |
self.query_fields, query, 100
|
106 |
+
), keywords
|
107 |
|
108 |
def need_fine_grained_tokenize(tk):
|
109 |
if len(tk) < 3:
|
rag/nlp/synonym.py
CHANGED
@@ -18,7 +18,7 @@ import json
|
|
18 |
import os
|
19 |
import time
|
20 |
import re
|
21 |
-
|
22 |
from api.utils.file_utils import get_project_base_directory
|
23 |
from api.utils.log_utils import logger
|
24 |
|
@@ -67,6 +67,10 @@ class Dealer:
|
|
67 |
logger.error("Fail to load synonym!" + str(e))
|
68 |
|
69 |
def lookup(self, tk):
|
|
|
|
|
|
|
|
|
70 |
self.lookup_num += 1
|
71 |
self.load()
|
72 |
res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
|
|
|
18 |
import os
|
19 |
import time
|
20 |
import re
|
21 |
+
from nltk.corpus import wordnet
|
22 |
from api.utils.file_utils import get_project_base_directory
|
23 |
from api.utils.log_utils import logger
|
24 |
|
|
|
67 |
logger.error("Fail to load synonym!" + str(e))
|
68 |
|
69 |
def lookup(self, tk):
|
70 |
+
if re.match(r"[a-z]+$", tk):
|
71 |
+
res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets("love")]) - set([tk]))
|
72 |
+
return [t for t in res if t]
|
73 |
+
|
74 |
self.lookup_num += 1
|
75 |
self.load()
|
76 |
res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
|