Kevin Hu committed on
Commit
58f507b
·
1 Parent(s): 49f80bf

Refine English synonym (#3371)

Browse files

### What problem does this PR solve?

#3361

### Type of change

- [x] Performance Improvement

agent/component/base.py CHANGED
@@ -446,9 +446,22 @@ class ComponentBase(ABC):
446
  outs = []
447
  for q in self._param.query:
448
  if q["component_id"]:
 
 
 
 
 
 
 
 
 
 
 
 
449
  outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
450
  self._param.inputs.append({"component_id": q["component_id"],
451
- "content": "\n".join([str(d["content"]) for d in outs[-1].to_dict('records')])})
 
452
  elif q["value"]:
453
  self._param.inputs.append({"component_id": None, "content": q["value"]})
454
  outs.append(pd.DataFrame([{"content": q["value"]}]))
 
446
  outs = []
447
  for q in self._param.query:
448
  if q["component_id"]:
449
+ if q["component_id"].split("@")[0].lower().find("begin") > 0:
450
+ cpn_id, key = q["component_id"].split("@")
451
+ for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
452
+ if p["key"] == key:
453
+ outs.append(pd.DataFrame([{"content": p["value"]}]))
454
+ self._param.inputs.append({"component_id": q["component_id"],
455
+ "content": p["value"]})
456
+ break
457
+ else:
458
+ assert False, f"Can't find parameter '{key}' for {cpn_id}"
459
+ continue
460
+
461
  outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
462
  self._param.inputs.append({"component_id": q["component_id"],
463
+ "content": "\n".join(
464
+ [str(d["content"]) for d in outs[-1].to_dict('records')])})
465
  elif q["value"]:
466
  self._param.inputs.append({"component_id": None, "content": q["value"]})
467
  outs.append(pd.DataFrame([{"content": q["value"]}]))
agent/component/generate.py CHANGED
@@ -104,6 +104,18 @@ class Generate(ComponentBase):
104
  retrieval_res = []
105
  self._param.inputs = []
106
  for para in self._param.parameters:
 
 
 
 
 
 
 
 
 
 
 
 
107
  cpn = self._canvas.get_component(para["component_id"])["obj"]
108
  if cpn.component_name.lower() == "answer":
109
  kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
 
104
  retrieval_res = []
105
  self._param.inputs = []
106
  for para in self._param.parameters:
107
+ if para["component_id"].split("@")[0].lower().find("begin") > 0:
108
+ cpn_id, key = para["component_id"].split("@")
109
+ for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
110
+ if p["key"] == key:
111
+ kwargs[para["key"]] = p["value"]
112
+ self._param.inputs.append(
113
+ {"component_id": para["component_id"], "content": kwargs[para["key"]]})
114
+ break
115
+ else:
116
+ assert False, f"Can't find parameter '{key}' for {cpn_id}"
117
+ continue
118
+
119
  cpn = self._canvas.get_component(para["component_id"])["obj"]
120
  if cpn.component_name.lower() == "answer":
121
  kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
api/apps/document_app.py CHANGED
@@ -25,6 +25,7 @@ from api.db.services.file2document_service import File2DocumentService
25
  from api.db.services.file_service import FileService
26
  from api.db.services.task_service import TaskService, queue_tasks
27
  from api.db.services.user_service import UserTenantService
 
28
  from rag.nlp import search
29
  from api.db.services import duplicate_name
30
  from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -518,3 +519,32 @@ def upload_and_parse():
518
  doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
519
 
520
  return get_json_result(data=doc_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  from api.db.services.file_service import FileService
26
  from api.db.services.task_service import TaskService, queue_tasks
27
  from api.db.services.user_service import UserTenantService
28
+ from deepdoc.parser.html_parser import RAGFlowHtmlParser
29
  from rag.nlp import search
30
  from api.db.services import duplicate_name
31
  from api.db.services.knowledgebase_service import KnowledgebaseService
 
519
  doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
520
 
521
  return get_json_result(data=doc_ids)
522
+
523
+
524
+ @manager.route('/parse', methods=['POST'])
525
+ @login_required
526
+ def parse():
527
+ url = request.json.get("url")
528
+ if url:
529
+ if not is_valid_url(url):
530
+ return get_json_result(
531
+ data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
532
+ from selenium.webdriver import Chrome, ChromeOptions
533
+ options = ChromeOptions()
534
+ options.add_argument('--headless')
535
+ options.add_argument('--disable-gpu')
536
+ options.add_argument('--no-sandbox')
537
+ options.add_argument('--disable-dev-shm-usage')
538
+ driver = Chrome(options=options)
539
+ driver.get(url)
540
+ sections = RAGFlowHtmlParser()(driver.page_source)
541
+ return get_json_result(data="\n".join(sections))
542
+
543
+ if 'file' not in request.files:
544
+ return get_json_result(
545
+ data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR)
546
+
547
+ file_objs = request.files.getlist('file')
548
+ txt = FileService.parse_docs(file_objs, current_user.id)
549
+
550
+ return get_json_result(data=txt)
rag/nlp/query.py CHANGED
@@ -75,11 +75,20 @@ class FulltextQueryer:
75
  if not self.isChinese(txt):
76
  txt = FulltextQueryer.rmWWW(txt)
77
  tks = rag_tokenizer.tokenize(txt).split(" ")
78
- tks_w = self.tw.weights(tks)
 
79
  tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
80
  tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
81
  tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
82
- q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
 
 
 
 
 
 
 
 
83
  for i in range(1, len(tks_w)):
84
  q.append(
85
  '"%s %s"^%.4f'
@@ -94,7 +103,7 @@ class FulltextQueryer:
94
  query = " ".join(q)
95
  return MatchTextExpr(
96
  self.query_fields, query, 100
97
- ), tks
98
 
99
  def need_fine_grained_tokenize(tk):
100
  if len(tk) < 3:
 
75
  if not self.isChinese(txt):
76
  txt = FulltextQueryer.rmWWW(txt)
77
  tks = rag_tokenizer.tokenize(txt).split(" ")
78
+ keywords = [t for t in tks if t]
79
+ tks_w = self.tw.weights(tks, preprocess=False)
80
  tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
81
  tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
82
  tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
83
+ syns = []
84
+ for tk, w in tks_w:
85
+ syn = self.syn.lookup(tk)
86
+ syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
87
+ keywords.extend(syn)
88
+ syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
89
+ syns.append(" ".join(syn))
90
+
91
+ q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
92
  for i in range(1, len(tks_w)):
93
  q.append(
94
  '"%s %s"^%.4f'
 
103
  query = " ".join(q)
104
  return MatchTextExpr(
105
  self.query_fields, query, 100
106
+ ), keywords
107
 
108
  def need_fine_grained_tokenize(tk):
109
  if len(tk) < 3:
rag/nlp/synonym.py CHANGED
@@ -18,7 +18,7 @@ import json
18
  import os
19
  import time
20
  import re
21
-
22
  from api.utils.file_utils import get_project_base_directory
23
  from api.utils.log_utils import logger
24
 
@@ -67,6 +67,10 @@ class Dealer:
67
  logger.error("Fail to load synonym!" + str(e))
68
 
69
  def lookup(self, tk):
 
 
 
 
70
  self.lookup_num += 1
71
  self.load()
72
  res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
 
18
  import os
19
  import time
20
  import re
21
+ from nltk.corpus import wordnet
22
  from api.utils.file_utils import get_project_base_directory
23
  from api.utils.log_utils import logger
24
 
 
67
  logger.error("Fail to load synonym!" + str(e))
68
 
69
  def lookup(self, tk):
70
+ if re.match(r"[a-z]+$", tk):
71
+ res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets("love")]) - set([tk]))
72
+ return [t for t in res if t]
73
+
74
  self.lookup_num += 1
75
  self.load()
76
  res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])