Kevin Hu
committed on
Commit
·
0d756a3
1
Parent(s):
c870c25
enlarge the default token length of RAPTOR summarization (#3454)
Browse files
### What problem does this PR solve?
#3426
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/document_app.py +27 -5
- graphrag/search.py +1 -1
- poetry.lock +0 -0
- pyproject.toml +1 -0
- rag/raptor.py +1 -1
api/apps/document_app.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
|
|
16 |
import os.path
|
17 |
import pathlib
|
18 |
import re
|
@@ -533,7 +534,7 @@ def parse():
|
|
533 |
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
|
534 |
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
535 |
os.makedirs(download_path, exist_ok=True)
|
536 |
-
from
|
537 |
options = ChromeOptions()
|
538 |
options.add_argument('--headless')
|
539 |
options.add_argument('--disable-gpu')
|
@@ -547,10 +548,31 @@ def parse():
|
|
547 |
})
|
548 |
driver = Chrome(options=options)
|
549 |
driver.get(url)
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
554 |
|
555 |
if 'file' not in request.files:
|
556 |
return get_json_result(
|
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License
|
15 |
#
|
16 |
+
import json
|
17 |
import os.path
|
18 |
import pathlib
|
19 |
import re
|
|
|
534 |
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
|
535 |
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
536 |
os.makedirs(download_path, exist_ok=True)
|
537 |
+
from seleniumwire.webdriver import Chrome, ChromeOptions
|
538 |
options = ChromeOptions()
|
539 |
options.add_argument('--headless')
|
540 |
options.add_argument('--disable-gpu')
|
|
|
548 |
})
|
549 |
driver = Chrome(options=options)
|
550 |
driver.get(url)
|
551 |
+
res_headers = [r.response.headers for r in driver.requests]
|
552 |
+
if len(res_headers) > 1:
|
553 |
+
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
554 |
+
driver.quit()
|
555 |
+
return get_json_result(data="\n".join(sections))
|
556 |
+
|
557 |
+
class File:
|
558 |
+
filename: str
|
559 |
+
filepath: str
|
560 |
+
|
561 |
+
def __init__(self, filename, filepath):
|
562 |
+
self.filename = filename
|
563 |
+
self.filepath = filepath
|
564 |
+
|
565 |
+
def read(self):
|
566 |
+
with open(self.filepath, "r") as f:
|
567 |
+
return f.read()
|
568 |
+
|
569 |
+
# Capture the complete quoted filename (one or more non-quote characters) from the JSON-serialized headers.
r = re.search(r"filename=\"([^\"]+)\"", json.dumps(res_headers))
|
570 |
+
if not r or r.group(1):
|
571 |
+
return get_json_result(
|
572 |
+
data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
|
573 |
+
f = File(r.group(1), os.path.join(download_path, r.group(1)))
|
574 |
+
txt = FileService.parse_docs([f], current_user.id)
|
575 |
+
return get_json_result(data=txt)
|
576 |
|
577 |
if 'file' not in request.files:
|
578 |
return get_json_result(
|
graphrag/search.py
CHANGED
@@ -68,7 +68,7 @@ class KGSearch(Dealer):
|
|
68 |
|
69 |
ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
|
70 |
ent_res_fields = self.dataStore.getFields(ent_res, src)
|
71 |
-
entities = [d["name_kwd"] for d in ent_res_fields.values()]
|
72 |
ent_ids = self.dataStore.getChunkIds(ent_res)
|
73 |
ent_content = merge_into_first(ent_res_fields, "-Entities-")
|
74 |
if ent_content:
|
|
|
68 |
|
69 |
ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
|
70 |
ent_res_fields = self.dataStore.getFields(ent_res, src)
|
71 |
+
# `.get` is a method and cannot be subscripted; index the entry directly, since the filter already skips entries without a truthy "name_kwd".
entities = [d["name_kwd"] for d in ent_res_fields.values() if d.get("name_kwd")]
|
72 |
ent_ids = self.dataStore.getChunkIds(ent_res)
|
73 |
ent_content = merge_into_first(ent_res_fields, "-Entities-")
|
74 |
if ent_content:
|
poetry.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
CHANGED
@@ -87,6 +87,7 @@ ruamel-base = "1.0.0"
|
|
87 |
scholarly = "1.7.11"
|
88 |
scikit-learn = "1.5.0"
|
89 |
selenium = "4.22.0"
|
|
|
90 |
setuptools = "^75.2.0"
|
91 |
shapely = "2.0.5"
|
92 |
six = "1.16.0"
|
|
|
87 |
scholarly = "1.7.11"
|
88 |
scikit-learn = "1.5.0"
|
89 |
selenium = "4.22.0"
|
90 |
+
selenium-wire = "5.1.0"
|
91 |
setuptools = "^75.2.0"
|
92 |
shapely = "2.0.5"
|
93 |
six = "1.16.0"
|
rag/raptor.py
CHANGED
@@ -26,7 +26,7 @@ from rag.utils import truncate
|
|
26 |
|
27 |
|
28 |
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
29 |
-
def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=
|
30 |
self._max_cluster = max_cluster
|
31 |
self._llm_model = llm_model
|
32 |
self._embd_model = embd_model
|
|
|
26 |
|
27 |
|
28 |
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
29 |
+
def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=512, threshold=0.1):
|
30 |
self._max_cluster = max_cluster
|
31 |
self._llm_model = llm_model
|
32 |
self._embd_model = embd_model
|