Kevin Hu
committed on
Commit
·
fd3ed7b
1
Parent(s):
02e5242
Enlarge the term weight difference (#3435)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- api/apps/document_app.py +12 -1
- rag/nlp/query.py +1 -1
- rag/nlp/term_weight.py +2 -0
api/apps/document_app.py
CHANGED
|
@@ -13,6 +13,7 @@
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License
|
| 15 |
#
|
|
|
|
| 16 |
import pathlib
|
| 17 |
import re
|
| 18 |
|
|
@@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
|
|
| 36 |
from api.settings import RetCode, docStoreConn
|
| 37 |
from api.utils.api_utils import get_json_result
|
| 38 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 39 |
-
from api.utils.file_utils import filename_type, thumbnail
|
| 40 |
from api.utils.web_utils import html2pdf, is_valid_url
|
| 41 |
from api.constants import IMG_BASE64_PREFIX
|
| 42 |
|
|
@@ -529,15 +530,25 @@ def parse():
|
|
| 529 |
if not is_valid_url(url):
|
| 530 |
return get_json_result(
|
| 531 |
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
|
|
|
|
|
|
| 532 |
from selenium.webdriver import Chrome, ChromeOptions
|
| 533 |
options = ChromeOptions()
|
| 534 |
options.add_argument('--headless')
|
| 535 |
options.add_argument('--disable-gpu')
|
| 536 |
options.add_argument('--no-sandbox')
|
| 537 |
options.add_argument('--disable-dev-shm-usage')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
driver = Chrome(options=options)
|
| 539 |
driver.get(url)
|
|
|
|
| 540 |
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
|
|
|
| 541 |
return get_json_result(data="\n".join(sections))
|
| 542 |
|
| 543 |
if 'file' not in request.files:
|
|
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License
|
| 15 |
#
|
| 16 |
+
import os.path
|
| 17 |
import pathlib
|
| 18 |
import re
|
| 19 |
|
|
|
|
| 37 |
from api.settings import RetCode, docStoreConn
|
| 38 |
from api.utils.api_utils import get_json_result
|
| 39 |
from rag.utils.storage_factory import STORAGE_IMPL
|
| 40 |
+
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
| 41 |
from api.utils.web_utils import html2pdf, is_valid_url
|
| 42 |
from api.constants import IMG_BASE64_PREFIX
|
| 43 |
|
|
|
|
| 530 |
if not is_valid_url(url):
|
| 531 |
return get_json_result(
|
| 532 |
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
| 533 |
+
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
| 534 |
+
os.makedirs(download_path, exist_ok=True)
|
| 535 |
from selenium.webdriver import Chrome, ChromeOptions
|
| 536 |
options = ChromeOptions()
|
| 537 |
options.add_argument('--headless')
|
| 538 |
options.add_argument('--disable-gpu')
|
| 539 |
options.add_argument('--no-sandbox')
|
| 540 |
options.add_argument('--disable-dev-shm-usage')
|
| 541 |
+
options.add_experimental_option('prefs', {
|
| 542 |
+
'download.default_directory': download_path,
|
| 543 |
+
'download.prompt_for_download': False,
|
| 544 |
+
'download.directory_upgrade': True,
|
| 545 |
+
'safebrowsing.enabled': True
|
| 546 |
+
})
|
| 547 |
driver = Chrome(options=options)
|
| 548 |
driver.get(url)
|
| 549 |
+
print(driver.get_downloadable_files())
|
| 550 |
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
| 551 |
+
driver.close()
|
| 552 |
return get_json_result(data="\n".join(sections))
|
| 553 |
|
| 554 |
if 'file' not in request.files:
|
rag/nlp/query.py
CHANGED
|
@@ -66,7 +66,7 @@ class FulltextQueryer:
|
|
| 66 |
|
| 67 |
def question(self, txt, tbl="qa", min_match:float=0.6):
|
| 68 |
txt = re.sub(
|
| 69 |
-
r"[ :\r\n\t,,。??/`!!&\^%%()
|
| 70 |
" ",
|
| 71 |
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
| 72 |
).strip()
|
|
|
|
| 66 |
|
| 67 |
def question(self, txt, tbl="qa", min_match:float=0.6):
|
| 68 |
txt = re.sub(
|
| 69 |
+
r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
|
| 70 |
" ",
|
| 71 |
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
| 72 |
).strip()
|
rag/nlp/term_weight.py
CHANGED
|
@@ -228,6 +228,7 @@ class Dealer:
|
|
| 228 |
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
| 229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 230 |
np.array([ner(t) * postag(t) for t in tks])
|
|
|
|
| 231 |
tw = list(zip(tks, wts))
|
| 232 |
else:
|
| 233 |
for tk in tks:
|
|
@@ -236,6 +237,7 @@ class Dealer:
|
|
| 236 |
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
| 237 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 238 |
np.array([ner(t) * postag(t) for t in tt])
|
|
|
|
| 239 |
tw.extend(zip(tt, wts))
|
| 240 |
|
| 241 |
S = np.sum([s for _, s in tw])
|
|
|
|
| 228 |
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
| 229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 230 |
np.array([ner(t) * postag(t) for t in tks])
|
| 231 |
+
wts = [math.exp(s) for s in wts]
|
| 232 |
tw = list(zip(tks, wts))
|
| 233 |
else:
|
| 234 |
for tk in tks:
|
|
|
|
| 237 |
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
| 238 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
| 239 |
np.array([ner(t) * postag(t) for t in tt])
|
| 240 |
+
wts = [math.exp(s) for s in wts]
|
| 241 |
tw.extend(zip(tt, wts))
|
| 242 |
|
| 243 |
S = np.sum([s for _, s in tw])
|