KevinHuSh
committed on
Commit
·
8f39e7a
1
Parent(s):
328b4c9
support snapshot download from local (#153)
Browse files* support snapshot download from local
* let snapshot download from local
- README.md +9 -3
- api/apps/conversation_app.py +6 -1
- deepdoc/parser/pdf_parser.py +15 -3
- deepdoc/vision/layout_recognizer.py +10 -1
- deepdoc/vision/ocr.py +31 -9
- deepdoc/vision/recognizer.py +10 -1
- deepdoc/vision/table_structure_recognizer.py +10 -1
- docker/README.md +1 -1
- docker/entrypoint.sh +1 -1
- docker/service_conf.yaml +1 -1
- rag/llm/embedding_model.py +14 -1
- rag/svr/task_executor.py +1 -1
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
<div align="center">
|
2 |
-
<a href="https://ragflow.io/">
|
3 |
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
|
4 |
</a>
|
5 |
</div>
|
@@ -11,7 +11,7 @@
|
|
11 |
</p>
|
12 |
|
13 |
<p align="center">
|
14 |
-
<a href="https://ragflow.io" target="_blank">
|
15 |
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
16 |
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
17 |
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
|
@@ -21,7 +21,7 @@
|
|
21 |
</a>
|
22 |
</p>
|
23 |
|
24 |
-
[RagFlow](http://ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
|
25 |
with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management
|
26 |
platform to empower your business with AI.
|
27 |
|
@@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
|
|
119 |
> The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
|
120 |
> and change the left part of *'80:80'*.
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
# Configuration
|
123 |
If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
|
124 |
Please refer to [README](./docker/README.md) and manually set the configuration.
|
|
|
1 |
<div align="center">
|
2 |
+
<a href="https://demo.ragflow.io/">
|
3 |
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
|
4 |
</a>
|
5 |
</div>
|
|
|
11 |
</p>
|
12 |
|
13 |
<p align="center">
|
14 |
+
<a href="https://demo.ragflow.io" target="_blank">
|
15 |
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
16 |
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
17 |
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
|
|
|
21 |
</a>
|
22 |
</p>
|
23 |
|
24 |
+
[RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
|
25 |
with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management
|
26 |
platform to empower your business with AI.
|
27 |
|
|
|
119 |
> The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
|
120 |
> and change the left part of *'80:80'*.
|
121 |
|
122 |
+
# System Architecture Diagram
|
123 |
+
|
124 |
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
125 |
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
|
126 |
+
</div>
|
127 |
+
|
128 |
# Configuration
|
129 |
If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
|
130 |
Please refer to [README](./docker/README.md) and manually set the configuration.
|
api/apps/conversation_app.py
CHANGED
@@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
|
|
320 |
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
|
321 |
docid_idx = list(docid_idx)[0]
|
322 |
docnm_idx = list(docnm_idx)[0]
|
|
|
|
|
|
|
|
|
|
|
323 |
return {
|
324 |
"answer": "\n".join([clmns, line, rows]),
|
325 |
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
|
326 |
-
"doc_aggs":
|
327 |
}
|
|
|
320 |
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
|
321 |
docid_idx = list(docid_idx)[0]
|
322 |
docnm_idx = list(docnm_idx)[0]
|
323 |
+
doc_aggs = {}
|
324 |
+
for r in tbl["rows"]:
|
325 |
+
if r[docid_idx] not in doc_aggs:
|
326 |
+
doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
|
327 |
+
doc_aggs[r[docid_idx]]["count"] += 1
|
328 |
return {
|
329 |
"answer": "\n".join([clmns, line, rows]),
|
330 |
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
|
331 |
+
"doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
|
332 |
}
|
deepdoc/parser/pdf_parser.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
|
|
2 |
import random
|
3 |
|
4 |
import fitz
|
@@ -12,10 +13,12 @@ from PIL import Image, ImageDraw
|
|
12 |
import numpy as np
|
13 |
|
14 |
from PyPDF2 import PdfReader as pdf2_read
|
|
|
|
|
15 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
16 |
from rag.nlp import huqie
|
17 |
from copy import deepcopy
|
18 |
-
from huggingface_hub import hf_hub_download
|
19 |
|
20 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
21 |
|
@@ -32,8 +35,17 @@ class HuParser:
|
|
32 |
self.updown_cnt_mdl = xgb.Booster()
|
33 |
if torch.cuda.is_available():
|
34 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
self.page_from = 0
|
38 |
"""
|
39 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
import os
|
3 |
import random
|
4 |
|
5 |
import fitz
|
|
|
13 |
import numpy as np
|
14 |
|
15 |
from PyPDF2 import PdfReader as pdf2_read
|
16 |
+
|
17 |
+
from api.utils.file_utils import get_project_base_directory
|
18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
19 |
from rag.nlp import huqie
|
20 |
from copy import deepcopy
|
21 |
+
from huggingface_hub import hf_hub_download, snapshot_download
|
22 |
|
23 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
24 |
|
|
|
35 |
self.updown_cnt_mdl = xgb.Booster()
|
36 |
if torch.cuda.is_available():
|
37 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
38 |
+
try:
|
39 |
+
model_dir = snapshot_download(
|
40 |
+
repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
41 |
+
local_dir=os.path.join(
|
42 |
+
get_project_base_directory(),
|
43 |
+
"rag/res/deepdoc"),
|
44 |
+
local_files_only=True)
|
45 |
+
except Exception as e:
|
46 |
+
model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
47 |
+
|
48 |
+
self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
|
49 |
self.page_from = 0
|
50 |
"""
|
51 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
deepdoc/vision/layout_recognizer.py
CHANGED
@@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
|
|
37 |
"Equation",
|
38 |
]
|
39 |
def __init__(self, domain):
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
42 |
self.garbage_layouts = ["footer", "header", "reference"]
|
43 |
|
|
|
37 |
"Equation",
|
38 |
]
|
39 |
def __init__(self, domain):
|
40 |
+
try:
|
41 |
+
model_dir = snapshot_download(
|
42 |
+
repo_id="InfiniFlow/deepdoc",
|
43 |
+
local_dir=os.path.join(
|
44 |
+
get_project_base_directory(),
|
45 |
+
"rag/res/deepdoc"),
|
46 |
+
local_files_only=True)
|
47 |
+
except Exception as e:
|
48 |
+
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
49 |
+
|
50 |
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
51 |
self.garbage_layouts = ["footer", "header", "reference"]
|
52 |
|
deepdoc/vision/ocr.py
CHANGED
@@ -14,6 +14,10 @@
|
|
14 |
import copy
|
15 |
import time
|
16 |
import os
|
|
|
|
|
|
|
|
|
17 |
from .operators import *
|
18 |
import numpy as np
|
19 |
import onnxruntime as ort
|
@@ -21,6 +25,7 @@ import onnxruntime as ort
|
|
21 |
from .postprocess import build_post_process
|
22 |
from rag.settings import cron_logger
|
23 |
|
|
|
24 |
def transform(data, ops=None):
|
25 |
""" transform """
|
26 |
if ops is None:
|
@@ -66,9 +71,15 @@ def load_model(model_dir, nm):
|
|
66 |
options.intra_op_num_threads = 2
|
67 |
options.inter_op_num_threads = 2
|
68 |
if False and ort.get_device() == "GPU":
|
69 |
-
sess = ort.InferenceSession(
|
|
|
|
|
|
|
70 |
else:
|
71 |
-
sess = ort.InferenceSession(
|
|
|
|
|
|
|
72 |
return sess, sess.get_inputs()[0]
|
73 |
|
74 |
|
@@ -331,7 +342,8 @@ class TextRecognizer(object):
|
|
331 |
outputs = self.predictor.run(None, input_dict)
|
332 |
break
|
333 |
except Exception as e:
|
334 |
-
if i >= 3:
|
|
|
335 |
time.sleep(5)
|
336 |
preds = outputs[0]
|
337 |
rec_result = self.postprocess_op(preds)
|
@@ -442,7 +454,8 @@ class TextDetector(object):
|
|
442 |
outputs = self.predictor.run(None, input_dict)
|
443 |
break
|
444 |
except Exception as e:
|
445 |
-
if i >= 3:
|
|
|
446 |
time.sleep(5)
|
447 |
|
448 |
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
|
@@ -466,7 +479,15 @@ class OCR(object):
|
|
466 |
|
467 |
"""
|
468 |
if not model_dir:
|
469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
470 |
|
471 |
self.text_detector = TextDetector(model_dir)
|
472 |
self.text_recognizer = TextRecognizer(model_dir)
|
@@ -548,14 +569,16 @@ class OCR(object):
|
|
548 |
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
|
549 |
len(dt_boxes), elapse))
|
550 |
|
551 |
-
return zip(self.sorted_boxes(dt_boxes), [
|
|
|
552 |
|
553 |
def recognize(self, ori_im, box):
|
554 |
img_crop = self.get_rotate_crop_image(ori_im, box)
|
555 |
|
556 |
rec_res, elapse = self.text_recognizer([img_crop])
|
557 |
text, score = rec_res[0]
|
558 |
-
if score < self.drop_score:
|
|
|
559 |
return text
|
560 |
|
561 |
def __call__(self, img, cls=True):
|
@@ -600,8 +623,7 @@ class OCR(object):
|
|
600 |
end = time.time()
|
601 |
time_dict['all'] = end - start
|
602 |
|
603 |
-
|
604 |
-
#for bno in range(len(img_crop_list)):
|
605 |
# print(f"{bno}, {rec_res[bno]}")
|
606 |
|
607 |
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
|
|
|
14 |
import copy
|
15 |
import time
|
16 |
import os
|
17 |
+
|
18 |
+
from huggingface_hub import snapshot_download
|
19 |
+
|
20 |
+
from api.utils.file_utils import get_project_base_directory
|
21 |
from .operators import *
|
22 |
import numpy as np
|
23 |
import onnxruntime as ort
|
|
|
25 |
from .postprocess import build_post_process
|
26 |
from rag.settings import cron_logger
|
27 |
|
28 |
+
|
29 |
def transform(data, ops=None):
|
30 |
""" transform """
|
31 |
if ops is None:
|
|
|
71 |
options.intra_op_num_threads = 2
|
72 |
options.inter_op_num_threads = 2
|
73 |
if False and ort.get_device() == "GPU":
|
74 |
+
sess = ort.InferenceSession(
|
75 |
+
model_file_path,
|
76 |
+
options=options,
|
77 |
+
providers=['CUDAExecutionProvider'])
|
78 |
else:
|
79 |
+
sess = ort.InferenceSession(
|
80 |
+
model_file_path,
|
81 |
+
options=options,
|
82 |
+
providers=['CPUExecutionProvider'])
|
83 |
return sess, sess.get_inputs()[0]
|
84 |
|
85 |
|
|
|
342 |
outputs = self.predictor.run(None, input_dict)
|
343 |
break
|
344 |
except Exception as e:
|
345 |
+
if i >= 3:
|
346 |
+
raise e
|
347 |
time.sleep(5)
|
348 |
preds = outputs[0]
|
349 |
rec_result = self.postprocess_op(preds)
|
|
|
454 |
outputs = self.predictor.run(None, input_dict)
|
455 |
break
|
456 |
except Exception as e:
|
457 |
+
if i >= 3:
|
458 |
+
raise e
|
459 |
time.sleep(5)
|
460 |
|
461 |
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
|
|
|
479 |
|
480 |
"""
|
481 |
if not model_dir:
|
482 |
+
try:
|
483 |
+
model_dir = snapshot_download(
|
484 |
+
repo_id="InfiniFlow/deepdoc",
|
485 |
+
local_dir=os.path.join(
|
486 |
+
get_project_base_directory(),
|
487 |
+
"rag/res/deepdoc"),
|
488 |
+
local_files_only=True)
|
489 |
+
except Exception as e:
|
490 |
+
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
491 |
|
492 |
self.text_detector = TextDetector(model_dir)
|
493 |
self.text_recognizer = TextRecognizer(model_dir)
|
|
|
569 |
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
|
570 |
len(dt_boxes), elapse))
|
571 |
|
572 |
+
return zip(self.sorted_boxes(dt_boxes), [
|
573 |
+
("", 0) for _ in range(len(dt_boxes))])
|
574 |
|
575 |
def recognize(self, ori_im, box):
|
576 |
img_crop = self.get_rotate_crop_image(ori_im, box)
|
577 |
|
578 |
rec_res, elapse = self.text_recognizer([img_crop])
|
579 |
text, score = rec_res[0]
|
580 |
+
if score < self.drop_score:
|
581 |
+
return ""
|
582 |
return text
|
583 |
|
584 |
def __call__(self, img, cls=True):
|
|
|
623 |
end = time.time()
|
624 |
time_dict['all'] = end - start
|
625 |
|
626 |
+
# for bno in range(len(img_crop_list)):
|
|
|
627 |
# print(f"{bno}, {rec_res[bno]}")
|
628 |
|
629 |
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
|
deepdoc/vision/recognizer.py
CHANGED
@@ -17,6 +17,7 @@ from copy import deepcopy
|
|
17 |
import onnxruntime as ort
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
|
|
20 |
from .operators import *
|
21 |
from rag.settings import cron_logger
|
22 |
|
@@ -35,7 +36,15 @@ class Recognizer(object):
|
|
35 |
|
36 |
"""
|
37 |
if not model_dir:
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
41 |
if not os.path.exists(model_file_path):
|
|
|
17 |
import onnxruntime as ort
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
20 |
+
from api.utils.file_utils import get_project_base_directory
|
21 |
from .operators import *
|
22 |
from rag.settings import cron_logger
|
23 |
|
|
|
36 |
|
37 |
"""
|
38 |
if not model_dir:
|
39 |
+
try:
|
40 |
+
model_dir = snapshot_download(
|
41 |
+
repo_id="InfiniFlow/deepdoc",
|
42 |
+
local_dir=os.path.join(
|
43 |
+
get_project_base_directory(),
|
44 |
+
"rag/res/deepdoc"),
|
45 |
+
local_files_only=True)
|
46 |
+
except Exception as e:
|
47 |
+
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
48 |
|
49 |
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
50 |
if not os.path.exists(model_file_path):
|
deepdoc/vision/table_structure_recognizer.py
CHANGED
@@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
|
|
34 |
]
|
35 |
|
36 |
def __init__(self):
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
39 |
|
40 |
def __call__(self, images, thr=0.2):
|
|
|
34 |
]
|
35 |
|
36 |
def __init__(self):
|
37 |
+
try:
|
38 |
+
model_dir = snapshot_download(
|
39 |
+
repo_id="InfiniFlow/deepdoc",
|
40 |
+
local_dir=os.path.join(
|
41 |
+
get_project_base_directory(),
|
42 |
+
"rag/res/deepdoc"),
|
43 |
+
local_files_only=True)
|
44 |
+
except Exception as e:
|
45 |
+
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
46 |
+
|
47 |
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
48 |
|
49 |
def __call__(self, images, thr=0.2):
|
docker/README.md
CHANGED
@@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
|
|
67 |
Newly signed-up users use the LLM configured in this section. Otherwise, users need to configure their own LLM in *setting*.
|
68 |
|
69 |
### factory
|
70 |
-
The LLM suppliers. '
|
71 |
|
72 |
### api_key
|
73 |
The corresponding API key of your assigned LLM vendor.
|
|
|
67 |
Newly signed-up users use the LLM configured in this section. Otherwise, users need to configure their own LLM in *setting*.
|
68 |
|
69 |
### factory
|
70 |
+
The LLM suppliers. "Tongyi-Qianwen", "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
|
71 |
|
72 |
### api_key
|
73 |
The corresponding API key of your assigned LLM vendor.
|
docker/entrypoint.sh
CHANGED
@@ -29,7 +29,7 @@ function task_bro(){
|
|
29 |
|
30 |
task_bro &
|
31 |
|
32 |
-
WS=
|
33 |
for ((i=0;i<WS;i++))
|
34 |
do
|
35 |
task_exe $i $WS &
|
|
|
29 |
|
30 |
task_bro &
|
31 |
|
32 |
+
WS=2
|
33 |
for ((i=0;i<WS;i++))
|
34 |
do
|
35 |
task_exe $i $WS &
|
docker/service_conf.yaml
CHANGED
@@ -16,7 +16,7 @@ minio:
|
|
16 |
es:
|
17 |
hosts: 'http://es01:9200'
|
18 |
user_default_llm:
|
19 |
-
factory: '
|
20 |
api_key: 'sk-xxxxxxxxxxxxx'
|
21 |
oauth:
|
22 |
github:
|
|
|
16 |
es:
|
17 |
hosts: 'http://es01:9200'
|
18 |
user_default_llm:
|
19 |
+
factory: 'Tongyi-Qianwen'
|
20 |
api_key: 'sk-xxxxxxxxxxxxx'
|
21 |
oauth:
|
22 |
github:
|
rag/llm/embedding_model.py
CHANGED
@@ -13,6 +13,7 @@
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
#
|
|
|
16 |
from abc import ABC
|
17 |
|
18 |
import dashscope
|
@@ -21,9 +22,21 @@ from FlagEmbedding import FlagModel
|
|
21 |
import torch
|
22 |
import numpy as np
|
23 |
from huggingface_hub import snapshot_download
|
|
|
|
|
24 |
from rag.utils import num_tokens_from_string
|
25 |
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
28 |
use_fp16=torch.cuda.is_available())
|
29 |
|
|
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
#
|
16 |
+
import os
|
17 |
from abc import ABC
|
18 |
|
19 |
import dashscope
|
|
|
22 |
import torch
|
23 |
import numpy as np
|
24 |
from huggingface_hub import snapshot_download
|
25 |
+
|
26 |
+
from api.utils.file_utils import get_project_base_directory
|
27 |
from rag.utils import num_tokens_from_string
|
28 |
|
29 |
+
try:
|
30 |
+
model_dir = snapshot_download(
|
31 |
+
repo_id="BAAI/bge-large-zh-v1.5",
|
32 |
+
local_dir=os.path.join(
|
33 |
+
get_project_base_directory(),
|
34 |
+
"rag/res/bge-large-zh-v1.5"),
|
35 |
+
local_files_only=True)
|
36 |
+
except Exception as e:
|
37 |
+
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
|
38 |
+
|
39 |
+
flag_model = FlagModel(model_dir,
|
40 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
41 |
use_fp16=torch.cuda.is_available())
|
42 |
|
rag/svr/task_executor.py
CHANGED
@@ -172,7 +172,7 @@ def init_kb(row):
|
|
172 |
def embedding(docs, mdl, parser_config={}, callback=None):
|
173 |
batch_size = 32
|
174 |
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
175 |
-
d["content_with_weight"] for d in docs]
|
176 |
tk_count = 0
|
177 |
if len(tts) == len(cnts):
|
178 |
tts_ = np.array([])
|
|
|
172 |
def embedding(docs, mdl, parser_config={}, callback=None):
|
173 |
batch_size = 32
|
174 |
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
175 |
+
re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
|
176 |
tk_count = 0
|
177 |
if len(tts) == len(cnts):
|
178 |
tts_ = np.array([])
|