KevinHuSh committed on
Commit
8ee4f9f
·
1 Parent(s): 3a31a8a

let's load model from local (#163)

Browse files
deepdoc/parser/pdf_parser.py CHANGED
@@ -18,7 +18,7 @@ from api.utils.file_utils import get_project_base_directory
18
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
19
  from rag.nlp import huqie
20
  from copy import deepcopy
21
- from huggingface_hub import hf_hub_download, snapshot_download
22
 
23
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
24
 
@@ -36,18 +36,18 @@ class HuParser:
36
  if torch.cuda.is_available():
37
  self.updown_cnt_mdl.set_param({"device": "cuda"})
38
  try:
39
- model_dir = snapshot_download(
40
- repo_id="InfiniFlow/text_concat_xgb_v1.0",
41
- local_dir=os.path.join(
42
  get_project_base_directory(),
43
- "rag/res/deepdoc"),
44
- local_files_only=True)
 
45
  except Exception as e:
46
  model_dir = snapshot_download(
47
  repo_id="InfiniFlow/text_concat_xgb_v1.0")
 
 
 
48
 
49
- self.updown_cnt_mdl.load_model(os.path.join(
50
- model_dir, "updown_concat_xgb.model"))
51
  self.page_from = 0
52
  """
53
  If you have trouble downloading HuggingFace models, -_^ this might help!!
 
18
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
19
  from rag.nlp import huqie
20
  from copy import deepcopy
21
+ from huggingface_hub import snapshot_download
22
 
23
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
24
 
 
36
  if torch.cuda.is_available():
37
  self.updown_cnt_mdl.set_param({"device": "cuda"})
38
  try:
39
+ model_dir = os.path.join(
 
 
40
  get_project_base_directory(),
41
+ "rag/res/deepdoc")
42
+ self.updown_cnt_mdl.load_model(os.path.join(
43
+ model_dir, "updown_concat_xgb.model"))
44
  except Exception as e:
45
  model_dir = snapshot_download(
46
  repo_id="InfiniFlow/text_concat_xgb_v1.0")
47
+ self.updown_cnt_mdl.load_model(os.path.join(
48
+ model_dir, "updown_concat_xgb.model"))
49
+
50
 
 
 
51
  self.page_from = 0
52
  """
53
  If you have trouble downloading HuggingFace models, -_^ this might help!!
deepdoc/vision/layout_recognizer.py CHANGED
@@ -17,7 +17,6 @@ from copy import deepcopy
17
  import numpy as np
18
  from huggingface_hub import snapshot_download
19
 
20
- from api.db import ParserType
21
  from api.utils.file_utils import get_project_base_directory
22
  from deepdoc.vision import Recognizer
23
 
@@ -39,17 +38,14 @@ class LayoutRecognizer(Recognizer):
39
 
40
  def __init__(self, domain):
41
  try:
42
- model_dir = snapshot_download(
43
- repo_id="InfiniFlow/deepdoc",
44
- local_dir=os.path.join(
45
  get_project_base_directory(),
46
- "rag/res/deepdoc"),
47
- local_files_only=True)
48
  except Exception as e:
49
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
50
 
51
- # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
52
- super().__init__(self.labels, domain, model_dir)
53
  self.garbage_layouts = ["footer", "header", "reference"]
54
 
55
  def __call__(self, image_list, ocr_res, scale_factor=3,
 
17
  import numpy as np
18
  from huggingface_hub import snapshot_download
19
 
 
20
  from api.utils.file_utils import get_project_base_directory
21
  from deepdoc.vision import Recognizer
22
 
 
38
 
39
  def __init__(self, domain):
40
  try:
41
+ model_dir = os.path.join(
 
 
42
  get_project_base_directory(),
43
+ "rag/res/deepdoc")
44
+ super().__init__(self.labels, domain, model_dir)
45
  except Exception as e:
46
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
47
+ super().__init__(self.labels, domain, model_dir)
48
 
 
 
49
  self.garbage_layouts = ["footer", "header", "reference"]
50
 
51
  def __call__(self, image_list, ocr_res, scale_factor=3,
deepdoc/vision/ocr.py CHANGED
@@ -480,17 +480,16 @@ class OCR(object):
480
  """
481
  if not model_dir:
482
  try:
483
- model_dir = snapshot_download(
484
- repo_id="InfiniFlow/deepdoc",
485
- local_dir=os.path.join(
486
  get_project_base_directory(),
487
- "rag/res/deepdoc"),
488
- local_files_only=True)
 
489
  except Exception as e:
490
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
 
491
 
492
- self.text_detector = TextDetector(model_dir)
493
- self.text_recognizer = TextRecognizer(model_dir)
494
  self.drop_score = 0.5
495
  self.crop_image_res_index = 0
496
 
 
480
  """
481
  if not model_dir:
482
  try:
483
+ model_dir = os.path.join(
 
 
484
  get_project_base_directory(),
485
+ "rag/res/deepdoc")
486
+ self.text_detector = TextDetector(model_dir)
487
+ self.text_recognizer = TextRecognizer(model_dir)
488
  except Exception as e:
489
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
490
+ self.text_detector = TextDetector(model_dir)
491
+ self.text_recognizer = TextRecognizer(model_dir)
492
 
 
 
493
  self.drop_score = 0.5
494
  self.crop_image_res_index = 0
495
 
deepdoc/vision/recognizer.py CHANGED
@@ -36,17 +36,14 @@ class Recognizer(object):
36
 
37
  """
38
  if not model_dir:
39
- try:
40
- model_dir = snapshot_download(
41
- repo_id="InfiniFlow/deepdoc",
42
- local_dir=os.path.join(
43
  get_project_base_directory(),
44
- "rag/res/deepdoc"),
45
- local_files_only=True)
46
- except Exception as e:
47
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
48
 
49
- model_file_path = os.path.join(model_dir, task_name + ".onnx")
50
  if not os.path.exists(model_file_path):
51
  raise ValueError("not find model file path {}".format(
52
  model_file_path))
 
36
 
37
  """
38
  if not model_dir:
39
+ model_dir = os.path.join(
 
 
 
40
  get_project_base_directory(),
41
+ "rag/res/deepdoc")
42
+ model_file_path = os.path.join(model_dir, task_name + ".onnx")
43
+ if not os.path.exists(model_file_path):
44
  model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
45
+ model_file_path = os.path.join(model_dir, task_name + ".onnx")
46
 
 
47
  if not os.path.exists(model_file_path):
48
  raise ValueError("not find model file path {}".format(
49
  model_file_path))
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -35,17 +35,11 @@ class TableStructureRecognizer(Recognizer):
35
 
36
  def __init__(self):
37
  try:
38
- model_dir = snapshot_download(
39
- repo_id="InfiniFlow/deepdoc",
40
- local_dir=os.path.join(
41
  get_project_base_directory(),
42
- "rag/res/deepdoc"),
43
- local_files_only=True)
44
  except Exception as e:
45
- model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
46
-
47
- # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
48
- super().__init__(self.labels, "tsr", model_dir)
49
 
50
  def __call__(self, images, thr=0.2):
51
  tbls = super().__call__(images, thr)
 
35
 
36
  def __init__(self):
37
  try:
38
+ super().__init__(self.labels, "tsr", os.path.join(
 
 
39
  get_project_base_directory(),
40
+ "rag/res/deepdoc"))
 
41
  except Exception as e:
42
+ super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))
 
 
 
43
 
44
  def __call__(self, images, thr=0.2):
45
  tbls = super().__call__(images, thr)
rag/llm/embedding_model.py CHANGED
@@ -28,16 +28,13 @@ from api.utils.file_utils import get_project_base_directory
28
  from rag.utils import num_tokens_from_string
29
 
30
  try:
31
- model_dir = snapshot_download(
32
- repo_id="BAAI/bge-large-zh-v1.5",
33
- local_dir=os.path.join(
34
  get_project_base_directory(),
35
  "rag/res/bge-large-zh-v1.5"),
36
- local_files_only=True)
 
37
  except Exception as e:
38
- model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
39
-
40
- flag_model = FlagModel(model_dir,
41
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
42
  use_fp16=torch.cuda.is_available())
43
 
 
28
  from rag.utils import num_tokens_from_string
29
 
30
  try:
31
+ flag_model = FlagModel(os.path.join(
 
 
32
  get_project_base_directory(),
33
  "rag/res/bge-large-zh-v1.5"),
34
+ query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
35
+ use_fp16=torch.cuda.is_available())
36
  except Exception as e:
37
+ flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
 
 
38
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
39
  use_fp16=torch.cuda.is_available())
40
 
rag/nlp/search.py CHANGED
@@ -247,7 +247,7 @@ class Dealer:
247
  for ck in chunks]
248
  cites = {}
249
  thr = 0.63
250
- while len(cites.keys()) == 0 and pieces_ and chunks_tks:
251
  for i, a in enumerate(pieces_):
252
  sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
253
  chunk_v,
 
247
  for ck in chunks]
248
  cites = {}
249
  thr = 0.63
250
+ while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
251
  for i, a in enumerate(pieces_):
252
  sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
253
  chunk_v,