Kevin Hu committed on
Commit
0d756a3
·
1 Parent(s): c870c25

enlarge the default token length of RAPTOR summarization (#3454)

Browse files

### What problem does this PR solve?

#3426

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/document_app.py CHANGED
@@ -13,6 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
 
16
  import os.path
17
  import pathlib
18
  import re
@@ -533,7 +534,7 @@ def parse():
533
  data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
534
  download_path = os.path.join(get_project_base_directory(), "logs/downloads")
535
  os.makedirs(download_path, exist_ok=True)
536
- from selenium.webdriver import Chrome, ChromeOptions
537
  options = ChromeOptions()
538
  options.add_argument('--headless')
539
  options.add_argument('--disable-gpu')
@@ -547,10 +548,31 @@ def parse():
547
  })
548
  driver = Chrome(options=options)
549
  driver.get(url)
550
- print(driver.get_downloadable_files())
551
- sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
552
- driver.close()
553
- return get_json_result(data="\n".join(sections))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  if 'file' not in request.files:
556
  return get_json_result(
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License
15
  #
16
+ import json
17
  import os.path
18
  import pathlib
19
  import re
 
534
  data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
535
  download_path = os.path.join(get_project_base_directory(), "logs/downloads")
536
  os.makedirs(download_path, exist_ok=True)
537
+ from seleniumwire.webdriver import Chrome, ChromeOptions
538
  options = ChromeOptions()
539
  options.add_argument('--headless')
540
  options.add_argument('--disable-gpu')
 
548
  })
549
  driver = Chrome(options=options)
550
  driver.get(url)
551
+ res_headers = [r.response.headers for r in driver.requests]
552
+ if len(res_headers) > 1:
553
+ sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
554
+ driver.quit()
555
+ return get_json_result(data="\n".join(sections))
556
+
557
+ class File:
558
+ filename: str
559
+ filepath: str
560
+
561
+ def __init__(self, filename, filepath):
562
+ self.filename = filename
563
+ self.filepath = filepath
564
+
565
+ def read(self):
566
+ with open(self.filepath, "r") as f:
567
+ return f.read()
568
+
569
+ r = re.search(r"filename=\"([^\"])\"", json.dumps(res_headers))
570
+ if not r or r.group(1):
571
+ return get_json_result(
572
+ data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
573
+ f = File(r.group(1), os.path.join(download_path, r.group(1)))
574
+ txt = FileService.parse_docs([f], current_user.id)
575
+ return get_json_result(data=txt)
576
 
577
  if 'file' not in request.files:
578
  return get_json_result(
graphrag/search.py CHANGED
@@ -68,7 +68,7 @@ class KGSearch(Dealer):
68
 
69
  ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
70
  ent_res_fields = self.dataStore.getFields(ent_res, src)
71
- entities = [d["name_kwd"] for d in ent_res_fields.values()]
72
  ent_ids = self.dataStore.getChunkIds(ent_res)
73
  ent_content = merge_into_first(ent_res_fields, "-Entities-")
74
  if ent_content:
 
68
 
69
  ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
70
  ent_res_fields = self.dataStore.getFields(ent_res, src)
71
+ entities = [d.get["name_kwd"] for d in ent_res_fields.values() if d.get("name_kwd")]
72
  ent_ids = self.dataStore.getChunkIds(ent_res)
73
  ent_content = merge_into_first(ent_res_fields, "-Entities-")
74
  if ent_content:
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -87,6 +87,7 @@ ruamel-base = "1.0.0"
87
  scholarly = "1.7.11"
88
  scikit-learn = "1.5.0"
89
  selenium = "4.22.0"
 
90
  setuptools = "^75.2.0"
91
  shapely = "2.0.5"
92
  six = "1.16.0"
 
87
  scholarly = "1.7.11"
88
  scikit-learn = "1.5.0"
89
  selenium = "4.22.0"
90
+ selenium-wire = "5.1.0"
91
  setuptools = "^75.2.0"
92
  shapely = "2.0.5"
93
  six = "1.16.0"
rag/raptor.py CHANGED
@@ -26,7 +26,7 @@ from rag.utils import truncate
26
 
27
 
28
  class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
29
- def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=256, threshold=0.1):
30
  self._max_cluster = max_cluster
31
  self._llm_model = llm_model
32
  self._embd_model = embd_model
 
26
 
27
 
28
  class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
29
+ def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=512, threshold=0.1):
30
  self._max_cluster = max_cluster
31
  self._llm_model = llm_model
32
  self._embd_model = embd_model