KevinHuSh
committed on
Commit
·
8ee4f9f
1
Parent(s):
3a31a8a
let's load model from local (#163)
Browse files
deepdoc/parser/pdf_parser.py
CHANGED
@@ -18,7 +18,7 @@ from api.utils.file_utils import get_project_base_directory
|
|
18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
19 |
from rag.nlp import huqie
|
20 |
from copy import deepcopy
|
21 |
-
from huggingface_hub import
|
22 |
|
23 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
24 |
|
@@ -36,18 +36,18 @@ class HuParser:
|
|
36 |
if torch.cuda.is_available():
|
37 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
38 |
try:
|
39 |
-
model_dir =
|
40 |
-
repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
41 |
-
local_dir=os.path.join(
|
42 |
get_project_base_directory(),
|
43 |
-
"rag/res/deepdoc")
|
44 |
-
|
|
|
45 |
except Exception as e:
|
46 |
model_dir = snapshot_download(
|
47 |
repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
|
|
|
|
|
|
48 |
|
49 |
-
self.updown_cnt_mdl.load_model(os.path.join(
|
50 |
-
model_dir, "updown_concat_xgb.model"))
|
51 |
self.page_from = 0
|
52 |
"""
|
53 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
|
|
18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
19 |
from rag.nlp import huqie
|
20 |
from copy import deepcopy
|
21 |
+
from huggingface_hub import snapshot_download
|
22 |
|
23 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
24 |
|
|
|
36 |
if torch.cuda.is_available():
|
37 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
38 |
try:
|
39 |
+
model_dir = os.path.join(
|
|
|
|
|
40 |
get_project_base_directory(),
|
41 |
+
"rag/res/deepdoc")
|
42 |
+
self.updown_cnt_mdl.load_model(os.path.join(
|
43 |
+
model_dir, "updown_concat_xgb.model"))
|
44 |
except Exception as e:
|
45 |
model_dir = snapshot_download(
|
46 |
repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
47 |
+
self.updown_cnt_mdl.load_model(os.path.join(
|
48 |
+
model_dir, "updown_concat_xgb.model"))
|
49 |
+
|
50 |
|
|
|
|
|
51 |
self.page_from = 0
|
52 |
"""
|
53 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
deepdoc/vision/layout_recognizer.py
CHANGED
@@ -17,7 +17,6 @@ from copy import deepcopy
|
|
17 |
import numpy as np
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
20 |
-
from api.db import ParserType
|
21 |
from api.utils.file_utils import get_project_base_directory
|
22 |
from deepdoc.vision import Recognizer
|
23 |
|
@@ -39,17 +38,14 @@ class LayoutRecognizer(Recognizer):
|
|
39 |
|
40 |
def __init__(self, domain):
|
41 |
try:
|
42 |
-
model_dir =
|
43 |
-
repo_id="InfiniFlow/deepdoc",
|
44 |
-
local_dir=os.path.join(
|
45 |
get_project_base_directory(),
|
46 |
-
"rag/res/deepdoc")
|
47 |
-
|
48 |
except Exception as e:
|
49 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
50 |
|
51 |
-
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
52 |
-
super().__init__(self.labels, domain, model_dir)
|
53 |
self.garbage_layouts = ["footer", "header", "reference"]
|
54 |
|
55 |
def __call__(self, image_list, ocr_res, scale_factor=3,
|
|
|
17 |
import numpy as np
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
|
|
20 |
from api.utils.file_utils import get_project_base_directory
|
21 |
from deepdoc.vision import Recognizer
|
22 |
|
|
|
38 |
|
39 |
def __init__(self, domain):
|
40 |
try:
|
41 |
+
model_dir = os.path.join(
|
|
|
|
|
42 |
get_project_base_directory(),
|
43 |
+
"rag/res/deepdoc")
|
44 |
+
super().__init__(self.labels, domain, model_dir)
|
45 |
except Exception as e:
|
46 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
47 |
+
super().__init__(self.labels, domain, model_dir)
|
48 |
|
|
|
|
|
49 |
self.garbage_layouts = ["footer", "header", "reference"]
|
50 |
|
51 |
def __call__(self, image_list, ocr_res, scale_factor=3,
|
deepdoc/vision/ocr.py
CHANGED
@@ -480,17 +480,16 @@ class OCR(object):
|
|
480 |
"""
|
481 |
if not model_dir:
|
482 |
try:
|
483 |
-
model_dir =
|
484 |
-
repo_id="InfiniFlow/deepdoc",
|
485 |
-
local_dir=os.path.join(
|
486 |
get_project_base_directory(),
|
487 |
-
"rag/res/deepdoc")
|
488 |
-
|
|
|
489 |
except Exception as e:
|
490 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
|
|
491 |
|
492 |
-
self.text_detector = TextDetector(model_dir)
|
493 |
-
self.text_recognizer = TextRecognizer(model_dir)
|
494 |
self.drop_score = 0.5
|
495 |
self.crop_image_res_index = 0
|
496 |
|
|
|
480 |
"""
|
481 |
if not model_dir:
|
482 |
try:
|
483 |
+
model_dir = os.path.join(
|
|
|
|
|
484 |
get_project_base_directory(),
|
485 |
+
"rag/res/deepdoc")
|
486 |
+
self.text_detector = TextDetector(model_dir)
|
487 |
+
self.text_recognizer = TextRecognizer(model_dir)
|
488 |
except Exception as e:
|
489 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
490 |
+
self.text_detector = TextDetector(model_dir)
|
491 |
+
self.text_recognizer = TextRecognizer(model_dir)
|
492 |
|
|
|
|
|
493 |
self.drop_score = 0.5
|
494 |
self.crop_image_res_index = 0
|
495 |
|
deepdoc/vision/recognizer.py
CHANGED
@@ -36,17 +36,14 @@ class Recognizer(object):
|
|
36 |
|
37 |
"""
|
38 |
if not model_dir:
|
39 |
-
|
40 |
-
model_dir = snapshot_download(
|
41 |
-
repo_id="InfiniFlow/deepdoc",
|
42 |
-
local_dir=os.path.join(
|
43 |
get_project_base_directory(),
|
44 |
-
"rag/res/deepdoc")
|
45 |
-
|
46 |
-
|
47 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
48 |
|
49 |
-
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
50 |
if not os.path.exists(model_file_path):
|
51 |
raise ValueError("not find model file path {}".format(
|
52 |
model_file_path))
|
|
|
36 |
|
37 |
"""
|
38 |
if not model_dir:
|
39 |
+
model_dir = os.path.join(
|
|
|
|
|
|
|
40 |
get_project_base_directory(),
|
41 |
+
"rag/res/deepdoc")
|
42 |
+
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
43 |
+
if not os.path.exists(model_file_path):
|
44 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
45 |
+
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
46 |
|
|
|
47 |
if not os.path.exists(model_file_path):
|
48 |
raise ValueError("not find model file path {}".format(
|
49 |
model_file_path))
|
deepdoc/vision/table_structure_recognizer.py
CHANGED
@@ -35,17 +35,11 @@ class TableStructureRecognizer(Recognizer):
|
|
35 |
|
36 |
def __init__(self):
|
37 |
try:
|
38 |
-
|
39 |
-
repo_id="InfiniFlow/deepdoc",
|
40 |
-
local_dir=os.path.join(
|
41 |
get_project_base_directory(),
|
42 |
-
"rag/res/deepdoc")
|
43 |
-
local_files_only=True)
|
44 |
except Exception as e:
|
45 |
-
|
46 |
-
|
47 |
-
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
48 |
-
super().__init__(self.labels, "tsr", model_dir)
|
49 |
|
50 |
def __call__(self, images, thr=0.2):
|
51 |
tbls = super().__call__(images, thr)
|
|
|
35 |
|
36 |
def __init__(self):
|
37 |
try:
|
38 |
+
super().__init__(self.labels, "tsr", os.path.join(
|
|
|
|
|
39 |
get_project_base_directory(),
|
40 |
+
"rag/res/deepdoc"))
|
|
|
41 |
except Exception as e:
|
42 |
+
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))
|
|
|
|
|
|
|
43 |
|
44 |
def __call__(self, images, thr=0.2):
|
45 |
tbls = super().__call__(images, thr)
|
rag/llm/embedding_model.py
CHANGED
@@ -28,16 +28,13 @@ from api.utils.file_utils import get_project_base_directory
|
|
28 |
from rag.utils import num_tokens_from_string
|
29 |
|
30 |
try:
|
31 |
-
|
32 |
-
repo_id="BAAI/bge-large-zh-v1.5",
|
33 |
-
local_dir=os.path.join(
|
34 |
get_project_base_directory(),
|
35 |
"rag/res/bge-large-zh-v1.5"),
|
36 |
-
|
|
|
37 |
except Exception as e:
|
38 |
-
|
39 |
-
|
40 |
-
flag_model = FlagModel(model_dir,
|
41 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
42 |
use_fp16=torch.cuda.is_available())
|
43 |
|
|
|
28 |
from rag.utils import num_tokens_from_string
|
29 |
|
30 |
try:
|
31 |
+
flag_model = FlagModel(os.path.join(
|
|
|
|
|
32 |
get_project_base_directory(),
|
33 |
"rag/res/bge-large-zh-v1.5"),
|
34 |
+
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
35 |
+
use_fp16=torch.cuda.is_available())
|
36 |
except Exception as e:
|
37 |
+
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
|
|
|
|
|
38 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
39 |
use_fp16=torch.cuda.is_available())
|
40 |
|
rag/nlp/search.py
CHANGED
@@ -247,7 +247,7 @@ class Dealer:
|
|
247 |
for ck in chunks]
|
248 |
cites = {}
|
249 |
thr = 0.63
|
250 |
-
while len(cites.keys()) == 0 and pieces_ and chunks_tks:
|
251 |
for i, a in enumerate(pieces_):
|
252 |
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
253 |
chunk_v,
|
|
|
247 |
for ck in chunks]
|
248 |
cites = {}
|
249 |
thr = 0.63
|
250 |
+
while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
|
251 |
for i, a in enumerate(pieces_):
|
252 |
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
253 |
chunk_v,
|