KevinHuSh
committed on
Commit
·
8ee4f9f
1
Parent(s):
3a31a8a
let's load model from local (#163)
Browse files
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -18,7 +18,7 @@ from api.utils.file_utils import get_project_base_directory
|
|
| 18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
| 19 |
from rag.nlp import huqie
|
| 20 |
from copy import deepcopy
|
| 21 |
-
from huggingface_hub import
|
| 22 |
|
| 23 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
| 24 |
|
|
@@ -36,18 +36,18 @@ class HuParser:
|
|
| 36 |
if torch.cuda.is_available():
|
| 37 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
| 38 |
try:
|
| 39 |
-
model_dir = snapshot_download(
|
| 40 |
-
repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
| 41 |
-
local_dir=os.path.join(
|
| 42 |
get_project_base_directory(),
|
| 43 |
-
"rag/res/deepdoc")
|
| 44 |
-
|
|
|
|
| 45 |
except Exception as e:
|
| 46 |
model_dir = snapshot_download(
|
| 47 |
repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
self.updown_cnt_mdl.load_model(os.path.join(
|
| 50 |
-
model_dir, "updown_concat_xgb.model"))
|
| 51 |
self.page_from = 0
|
| 52 |
"""
|
| 53 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
|
|
|
| 18 |
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
| 19 |
from rag.nlp import huqie
|
| 20 |
from copy import deepcopy
|
| 21 |
+
from huggingface_hub import snapshot_download
|
| 22 |
|
| 23 |
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
| 24 |
|
|
|
|
| 36 |
if torch.cuda.is_available():
|
| 37 |
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
| 38 |
try:
|
| 39 |
+
model_dir = os.path.join(
|
|
|
|
|
|
|
| 40 |
get_project_base_directory(),
|
| 41 |
+
"rag/res/deepdoc")
|
| 42 |
+
self.updown_cnt_mdl.load_model(os.path.join(
|
| 43 |
+
model_dir, "updown_concat_xgb.model"))
|
| 44 |
except Exception as e:
|
| 45 |
model_dir = snapshot_download(
|
| 46 |
repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
| 47 |
+
self.updown_cnt_mdl.load_model(os.path.join(
|
| 48 |
+
model_dir, "updown_concat_xgb.model"))
|
| 49 |
+
|
| 50 |
|
|
|
|
|
|
|
| 51 |
self.page_from = 0
|
| 52 |
"""
|
| 53 |
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
deepdoc/vision/layout_recognizer.py
CHANGED
|
@@ -17,7 +17,6 @@ from copy import deepcopy
|
|
| 17 |
import numpy as np
|
| 18 |
from huggingface_hub import snapshot_download
|
| 19 |
|
| 20 |
-
from api.db import ParserType
|
| 21 |
from api.utils.file_utils import get_project_base_directory
|
| 22 |
from deepdoc.vision import Recognizer
|
| 23 |
|
|
@@ -39,17 +38,14 @@ class LayoutRecognizer(Recognizer):
|
|
| 39 |
|
| 40 |
def __init__(self, domain):
|
| 41 |
try:
|
| 42 |
-
model_dir = snapshot_download(
|
| 43 |
-
repo_id="InfiniFlow/deepdoc",
|
| 44 |
-
local_dir=os.path.join(
|
| 45 |
get_project_base_directory(),
|
| 46 |
-
"rag/res/deepdoc")
|
| 47 |
-
|
| 48 |
except Exception as e:
|
| 49 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
|
| 50 |
|
| 51 |
-
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
| 52 |
-
super().__init__(self.labels, domain, model_dir)
|
| 53 |
self.garbage_layouts = ["footer", "header", "reference"]
|
| 54 |
|
| 55 |
def __call__(self, image_list, ocr_res, scale_factor=3,
|
|
|
|
| 17 |
import numpy as np
|
| 18 |
from huggingface_hub import snapshot_download
|
| 19 |
|
|
|
|
| 20 |
from api.utils.file_utils import get_project_base_directory
|
| 21 |
from deepdoc.vision import Recognizer
|
| 22 |
|
|
|
|
| 38 |
|
| 39 |
def __init__(self, domain):
|
| 40 |
try:
|
| 41 |
+
model_dir = os.path.join(
|
|
|
|
|
|
|
| 42 |
get_project_base_directory(),
|
| 43 |
+
"rag/res/deepdoc")
|
| 44 |
+
super().__init__(self.labels, domain, model_dir)
|
| 45 |
except Exception as e:
|
| 46 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
| 47 |
+
super().__init__(self.labels, domain, model_dir)
|
| 48 |
|
|
|
|
|
|
|
| 49 |
self.garbage_layouts = ["footer", "header", "reference"]
|
| 50 |
|
| 51 |
def __call__(self, image_list, ocr_res, scale_factor=3,
|
deepdoc/vision/ocr.py
CHANGED
|
@@ -480,17 +480,16 @@ class OCR(object):
|
|
| 480 |
"""
|
| 481 |
if not model_dir:
|
| 482 |
try:
|
| 483 |
-
model_dir = snapshot_download(
|
| 484 |
-
repo_id="InfiniFlow/deepdoc",
|
| 485 |
-
local_dir=os.path.join(
|
| 486 |
get_project_base_directory(),
|
| 487 |
-
"rag/res/deepdoc")
|
| 488 |
-
|
|
|
|
| 489 |
except Exception as e:
|
| 490 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
|
|
|
|
| 491 |
|
| 492 |
-
self.text_detector = TextDetector(model_dir)
|
| 493 |
-
self.text_recognizer = TextRecognizer(model_dir)
|
| 494 |
self.drop_score = 0.5
|
| 495 |
self.crop_image_res_index = 0
|
| 496 |
|
|
|
|
| 480 |
"""
|
| 481 |
if not model_dir:
|
| 482 |
try:
|
| 483 |
+
model_dir = os.path.join(
|
|
|
|
|
|
|
| 484 |
get_project_base_directory(),
|
| 485 |
+
"rag/res/deepdoc")
|
| 486 |
+
self.text_detector = TextDetector(model_dir)
|
| 487 |
+
self.text_recognizer = TextRecognizer(model_dir)
|
| 488 |
except Exception as e:
|
| 489 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
| 490 |
+
self.text_detector = TextDetector(model_dir)
|
| 491 |
+
self.text_recognizer = TextRecognizer(model_dir)
|
| 492 |
|
|
|
|
|
|
|
| 493 |
self.drop_score = 0.5
|
| 494 |
self.crop_image_res_index = 0
|
| 495 |
|
deepdoc/vision/recognizer.py
CHANGED
|
@@ -36,17 +36,14 @@ class Recognizer(object):
|
|
| 36 |
|
| 37 |
"""
|
| 38 |
if not model_dir:
|
| 39 |
-
|
| 40 |
-
model_dir = snapshot_download(
|
| 41 |
-
repo_id="InfiniFlow/deepdoc",
|
| 42 |
-
local_dir=os.path.join(
|
| 43 |
get_project_base_directory(),
|
| 44 |
-
"rag/res/deepdoc")
|
| 45 |
-
|
| 46 |
-
|
| 47 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
|
|
|
| 48 |
|
| 49 |
-
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
| 50 |
if not os.path.exists(model_file_path):
|
| 51 |
raise ValueError("not find model file path {}".format(
|
| 52 |
model_file_path))
|
|
|
|
| 36 |
|
| 37 |
"""
|
| 38 |
if not model_dir:
|
| 39 |
+
model_dir = os.path.join(
|
|
|
|
|
|
|
|
|
|
| 40 |
get_project_base_directory(),
|
| 41 |
+
"rag/res/deepdoc")
|
| 42 |
+
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
| 43 |
+
if not os.path.exists(model_file_path):
|
| 44 |
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
| 45 |
+
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
| 46 |
|
|
|
|
| 47 |
if not os.path.exists(model_file_path):
|
| 48 |
raise ValueError("not find model file path {}".format(
|
| 49 |
model_file_path))
|
deepdoc/vision/table_structure_recognizer.py
CHANGED
|
@@ -35,17 +35,11 @@ class TableStructureRecognizer(Recognizer):
|
|
| 35 |
|
| 36 |
def __init__(self):
|
| 37 |
try:
|
| 38 |
-
|
| 39 |
-
repo_id="InfiniFlow/deepdoc",
|
| 40 |
-
local_dir=os.path.join(
|
| 41 |
get_project_base_directory(),
|
| 42 |
-
"rag/res/deepdoc")
|
| 43 |
-
local_files_only=True)
|
| 44 |
except Exception as e:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
| 48 |
-
super().__init__(self.labels, "tsr", model_dir)
|
| 49 |
|
| 50 |
def __call__(self, images, thr=0.2):
|
| 51 |
tbls = super().__call__(images, thr)
|
|
|
|
| 35 |
|
| 36 |
def __init__(self):
|
| 37 |
try:
|
| 38 |
+
super().__init__(self.labels, "tsr", os.path.join(
|
|
|
|
|
|
|
| 39 |
get_project_base_directory(),
|
| 40 |
+
"rag/res/deepdoc"))
|
|
|
|
| 41 |
except Exception as e:
|
| 42 |
+
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
def __call__(self, images, thr=0.2):
|
| 45 |
tbls = super().__call__(images, thr)
|
rag/llm/embedding_model.py
CHANGED
|
@@ -28,16 +28,13 @@ from api.utils.file_utils import get_project_base_directory
|
|
| 28 |
from rag.utils import num_tokens_from_string
|
| 29 |
|
| 30 |
try:
|
| 31 |
-
|
| 32 |
-
repo_id="BAAI/bge-large-zh-v1.5",
|
| 33 |
-
local_dir=os.path.join(
|
| 34 |
get_project_base_directory(),
|
| 35 |
"rag/res/bge-large-zh-v1.5"),
|
| 36 |
-
|
|
|
|
| 37 |
except Exception as e:
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
flag_model = FlagModel(model_dir,
|
| 41 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
| 42 |
use_fp16=torch.cuda.is_available())
|
| 43 |
|
|
|
|
| 28 |
from rag.utils import num_tokens_from_string
|
| 29 |
|
| 30 |
try:
|
| 31 |
+
flag_model = FlagModel(os.path.join(
|
|
|
|
|
|
|
| 32 |
get_project_base_directory(),
|
| 33 |
"rag/res/bge-large-zh-v1.5"),
|
| 34 |
+
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
| 35 |
+
use_fp16=torch.cuda.is_available())
|
| 36 |
except Exception as e:
|
| 37 |
+
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
|
|
|
|
|
|
|
| 38 |
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
| 39 |
use_fp16=torch.cuda.is_available())
|
| 40 |
|
rag/nlp/search.py
CHANGED
|
@@ -247,7 +247,7 @@ class Dealer:
|
|
| 247 |
for ck in chunks]
|
| 248 |
cites = {}
|
| 249 |
thr = 0.63
|
| 250 |
-
while len(cites.keys()) == 0 and pieces_ and chunks_tks:
|
| 251 |
for i, a in enumerate(pieces_):
|
| 252 |
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
| 253 |
chunk_v,
|
|
|
|
| 247 |
for ck in chunks]
|
| 248 |
cites = {}
|
| 249 |
thr = 0.63
|
| 250 |
+
while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
|
| 251 |
for i, a in enumerate(pieces_):
|
| 252 |
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
| 253 |
chunk_v,
|