KevinHuSh committed on
Commit
8f39e7a
·
1 Parent(s): 328b4c9

support snapshot download from local (#153)

Browse files

* support snapshot download from local

* let snapshot download from local

README.md CHANGED
@@ -1,5 +1,5 @@
1
  <div align="center">
2
- <a href="https://ragflow.io/">
3
  <img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
4
  </a>
5
  </div>
@@ -11,7 +11,7 @@
11
  </p>
12
 
13
  <p align="center">
14
- <a href="https://ragflow.io" target="_blank">
15
  <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
16
  <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
17
  <img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
@@ -21,7 +21,7 @@
21
  </a>
22
  </p>
23
 
24
- [RagFlow](http://ragflow.io) is a knowledge management platform built on custom-build document understanding engine and LLM,
25
  with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management
26
  platform to empower your business with AI.
27
 
@@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
119
  > The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
120
  > and change the left part of *'80:80'*.
121
 
 
 
 
 
 
 
122
  # Configuration
123
  If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
124
  Please refer to [README](./docker/README.md) and manually set the configuration.
 
1
  <div align="center">
2
+ <a href="https://demo.ragflow.io/">
3
  <img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
4
  </a>
5
  </div>
 
11
  </p>
12
 
13
  <p align="center">
14
+ <a href="https://demo.ragflow.io" target="_blank">
15
  <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
16
  <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
17
  <img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
 
21
  </a>
22
  </p>
23
 
24
+ [RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
25
  with reasoned and well-founded answers to your questions. By cloning this repository, you can deploy your own knowledge management
26
  platform to empower your business with AI.
27
 
 
119
  > The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
120
  > and change the left part of *'80:80'*.
121
 
122
+ # System Architecture Diagram
123
+
124
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
125
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
126
+ </div>
127
+
128
  # Configuration
129
  If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
130
  Please refer to [README](./docker/README.md) and manually set the configuration.
api/apps/conversation_app.py CHANGED
@@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
320
  rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
321
  docid_idx = list(docid_idx)[0]
322
  docnm_idx = list(docnm_idx)[0]
 
 
 
 
 
323
  return {
324
  "answer": "\n".join([clmns, line, rows]),
325
  "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
326
- "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
327
  }
 
320
  rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
321
  docid_idx = list(docid_idx)[0]
322
  docnm_idx = list(docnm_idx)[0]
323
+ doc_aggs = {}
324
+ for r in tbl["rows"]:
325
+ if r[docid_idx] not in doc_aggs:
326
+ doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
327
+ doc_aggs[r[docid_idx]]["count"] += 1
328
  return {
329
  "answer": "\n".join([clmns, line, rows]),
330
  "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
331
+ "doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
332
  }
deepdoc/parser/pdf_parser.py CHANGED
@@ -1,4 +1,5 @@
1
  # -*- coding: utf-8 -*-
 
2
  import random
3
 
4
  import fitz
@@ -12,10 +13,12 @@ from PIL import Image, ImageDraw
12
  import numpy as np
13
 
14
  from PyPDF2 import PdfReader as pdf2_read
 
 
15
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
16
  from rag.nlp import huqie
17
  from copy import deepcopy
18
- from huggingface_hub import hf_hub_download
19
 
20
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
21
 
@@ -32,8 +35,17 @@ class HuParser:
32
  self.updown_cnt_mdl = xgb.Booster()
33
  if torch.cuda.is_available():
34
  self.updown_cnt_mdl.set_param({"device": "cuda"})
35
- self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
36
- filename="updown_concat_xgb.model"))
 
 
 
 
 
 
 
 
 
37
  self.page_from = 0
38
  """
39
  If you have trouble downloading HuggingFace models, -_^ this might help!!
 
1
  # -*- coding: utf-8 -*-
2
+ import os
3
  import random
4
 
5
  import fitz
 
13
  import numpy as np
14
 
15
  from PyPDF2 import PdfReader as pdf2_read
16
+
17
+ from api.utils.file_utils import get_project_base_directory
18
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
19
  from rag.nlp import huqie
20
  from copy import deepcopy
21
+ from huggingface_hub import hf_hub_download, snapshot_download
22
 
23
  logging.getLogger("pdfminer").setLevel(logging.WARNING)
24
 
 
35
  self.updown_cnt_mdl = xgb.Booster()
36
  if torch.cuda.is_available():
37
  self.updown_cnt_mdl.set_param({"device": "cuda"})
38
+ try:
39
+ model_dir = snapshot_download(
40
+ repo_id="InfiniFlow/text_concat_xgb_v1.0",
41
+ local_dir=os.path.join(
42
+ get_project_base_directory(),
43
+ "rag/res/deepdoc"),
44
+ local_files_only=True)
45
+ except Exception as e:
46
+ model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
47
+
48
+ self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
49
  self.page_from = 0
50
  """
51
  If you have trouble downloading HuggingFace models, -_^ this might help!!
deepdoc/vision/layout_recognizer.py CHANGED
@@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
37
  "Equation",
38
  ]
39
  def __init__(self, domain):
40
- model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
 
 
 
 
 
 
 
 
41
  super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
42
  self.garbage_layouts = ["footer", "header", "reference"]
43
 
 
37
  "Equation",
38
  ]
39
  def __init__(self, domain):
40
+ try:
41
+ model_dir = snapshot_download(
42
+ repo_id="InfiniFlow/deepdoc",
43
+ local_dir=os.path.join(
44
+ get_project_base_directory(),
45
+ "rag/res/deepdoc"),
46
+ local_files_only=True)
47
+ except Exception as e:
48
+ model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
49
+
50
  super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
51
  self.garbage_layouts = ["footer", "header", "reference"]
52
 
deepdoc/vision/ocr.py CHANGED
@@ -14,6 +14,10 @@
14
  import copy
15
  import time
16
  import os
 
 
 
 
17
  from .operators import *
18
  import numpy as np
19
  import onnxruntime as ort
@@ -21,6 +25,7 @@ import onnxruntime as ort
21
  from .postprocess import build_post_process
22
  from rag.settings import cron_logger
23
 
 
24
  def transform(data, ops=None):
25
  """ transform """
26
  if ops is None:
@@ -66,9 +71,15 @@ def load_model(model_dir, nm):
66
  options.intra_op_num_threads = 2
67
  options.inter_op_num_threads = 2
68
  if False and ort.get_device() == "GPU":
69
- sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
 
 
 
70
  else:
71
- sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
 
 
 
72
  return sess, sess.get_inputs()[0]
73
 
74
 
@@ -331,7 +342,8 @@ class TextRecognizer(object):
331
  outputs = self.predictor.run(None, input_dict)
332
  break
333
  except Exception as e:
334
- if i >= 3: raise e
 
335
  time.sleep(5)
336
  preds = outputs[0]
337
  rec_result = self.postprocess_op(preds)
@@ -442,7 +454,8 @@ class TextDetector(object):
442
  outputs = self.predictor.run(None, input_dict)
443
  break
444
  except Exception as e:
445
- if i >= 3: raise e
 
446
  time.sleep(5)
447
 
448
  post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
@@ -466,7 +479,15 @@ class OCR(object):
466
 
467
  """
468
  if not model_dir:
469
- model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
 
 
 
 
 
 
 
470
 
471
  self.text_detector = TextDetector(model_dir)
472
  self.text_recognizer = TextRecognizer(model_dir)
@@ -548,14 +569,16 @@ class OCR(object):
548
  cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
549
  len(dt_boxes), elapse))
550
 
551
- return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
 
552
 
553
  def recognize(self, ori_im, box):
554
  img_crop = self.get_rotate_crop_image(ori_im, box)
555
 
556
  rec_res, elapse = self.text_recognizer([img_crop])
557
  text, score = rec_res[0]
558
- if score < self.drop_score:return ""
 
559
  return text
560
 
561
  def __call__(self, img, cls=True):
@@ -600,8 +623,7 @@ class OCR(object):
600
  end = time.time()
601
  time_dict['all'] = end - start
602
 
603
-
604
- #for bno in range(len(img_crop_list)):
605
  # print(f"{bno}, {rec_res[bno]}")
606
 
607
  return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
 
14
  import copy
15
  import time
16
  import os
17
+
18
+ from huggingface_hub import snapshot_download
19
+
20
+ from api.utils.file_utils import get_project_base_directory
21
  from .operators import *
22
  import numpy as np
23
  import onnxruntime as ort
 
25
  from .postprocess import build_post_process
26
  from rag.settings import cron_logger
27
 
28
+
29
  def transform(data, ops=None):
30
  """ transform """
31
  if ops is None:
 
71
  options.intra_op_num_threads = 2
72
  options.inter_op_num_threads = 2
73
  if False and ort.get_device() == "GPU":
74
+ sess = ort.InferenceSession(
75
+ model_file_path,
76
+ options=options,
77
+ providers=['CUDAExecutionProvider'])
78
  else:
79
+ sess = ort.InferenceSession(
80
+ model_file_path,
81
+ options=options,
82
+ providers=['CPUExecutionProvider'])
83
  return sess, sess.get_inputs()[0]
84
 
85
 
 
342
  outputs = self.predictor.run(None, input_dict)
343
  break
344
  except Exception as e:
345
+ if i >= 3:
346
+ raise e
347
  time.sleep(5)
348
  preds = outputs[0]
349
  rec_result = self.postprocess_op(preds)
 
454
  outputs = self.predictor.run(None, input_dict)
455
  break
456
  except Exception as e:
457
+ if i >= 3:
458
+ raise e
459
  time.sleep(5)
460
 
461
  post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
 
479
 
480
  """
481
  if not model_dir:
482
+ try:
483
+ model_dir = snapshot_download(
484
+ repo_id="InfiniFlow/deepdoc",
485
+ local_dir=os.path.join(
486
+ get_project_base_directory(),
487
+ "rag/res/deepdoc"),
488
+ local_files_only=True)
489
+ except Exception as e:
490
+ model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
491
 
492
  self.text_detector = TextDetector(model_dir)
493
  self.text_recognizer = TextRecognizer(model_dir)
 
569
  cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
570
  len(dt_boxes), elapse))
571
 
572
+ return zip(self.sorted_boxes(dt_boxes), [
573
+ ("", 0) for _ in range(len(dt_boxes))])
574
 
575
  def recognize(self, ori_im, box):
576
  img_crop = self.get_rotate_crop_image(ori_im, box)
577
 
578
  rec_res, elapse = self.text_recognizer([img_crop])
579
  text, score = rec_res[0]
580
+ if score < self.drop_score:
581
+ return ""
582
  return text
583
 
584
  def __call__(self, img, cls=True):
 
623
  end = time.time()
624
  time_dict['all'] = end - start
625
 
626
+ # for bno in range(len(img_crop_list)):
 
627
  # print(f"{bno}, {rec_res[bno]}")
628
 
629
  return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
deepdoc/vision/recognizer.py CHANGED
@@ -17,6 +17,7 @@ from copy import deepcopy
17
  import onnxruntime as ort
18
  from huggingface_hub import snapshot_download
19
 
 
20
  from .operators import *
21
  from rag.settings import cron_logger
22
 
@@ -35,7 +36,15 @@ class Recognizer(object):
35
 
36
  """
37
  if not model_dir:
38
- model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
 
 
 
 
 
 
 
39
 
40
  model_file_path = os.path.join(model_dir, task_name + ".onnx")
41
  if not os.path.exists(model_file_path):
 
17
  import onnxruntime as ort
18
  from huggingface_hub import snapshot_download
19
 
20
+ from api.utils.file_utils import get_project_base_directory
21
  from .operators import *
22
  from rag.settings import cron_logger
23
 
 
36
 
37
  """
38
  if not model_dir:
39
+ try:
40
+ model_dir = snapshot_download(
41
+ repo_id="InfiniFlow/deepdoc",
42
+ local_dir=os.path.join(
43
+ get_project_base_directory(),
44
+ "rag/res/deepdoc"),
45
+ local_files_only=True)
46
+ except Exception as e:
47
+ model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
48
 
49
  model_file_path = os.path.join(model_dir, task_name + ".onnx")
50
  if not os.path.exists(model_file_path):
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
34
  ]
35
 
36
  def __init__(self):
37
- model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
 
 
 
 
 
 
 
 
38
  super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
39
 
40
  def __call__(self, images, thr=0.2):
 
34
  ]
35
 
36
  def __init__(self):
37
+ try:
38
+ model_dir = snapshot_download(
39
+ repo_id="InfiniFlow/deepdoc",
40
+ local_dir=os.path.join(
41
+ get_project_base_directory(),
42
+ "rag/res/deepdoc"),
43
+ local_files_only=True)
44
+ except Exception as e:
45
+ model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
46
+
47
  super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
48
 
49
  def __call__(self, images, thr=0.2):
docker/README.md CHANGED
@@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
67
  Newly signed-up users use the LLM configured by this part. Otherwise, users need to configure their own LLM in *setting*.
68
 
69
  ### factory
70
- The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported.
71
 
72
  ### api_key
73
  The corresponding API key of your assigned LLM vendor.
 
67
  Newly signed-up users use the LLM configured by this part. Otherwise, users need to configure their own LLM in *setting*.
68
 
69
  ### factory
70
+ The LLM suppliers. "Tongyi-Qianwen", "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
71
 
72
  ### api_key
73
  The corresponding API key of your assigned LLM vendor.
docker/entrypoint.sh CHANGED
@@ -29,7 +29,7 @@ function task_bro(){
29
 
30
  task_bro &
31
 
32
- WS=8
33
  for ((i=0;i<WS;i++))
34
  do
35
  task_exe $i $WS &
 
29
 
30
  task_bro &
31
 
32
+ WS=2
33
  for ((i=0;i<WS;i++))
34
  do
35
  task_exe $i $WS &
docker/service_conf.yaml CHANGED
@@ -16,7 +16,7 @@ minio:
16
  es:
17
  hosts: 'http://es01:9200'
18
  user_default_llm:
19
- factory: '通义千问'
20
  api_key: 'sk-xxxxxxxxxxxxx'
21
  oauth:
22
  github:
 
16
  es:
17
  hosts: 'http://es01:9200'
18
  user_default_llm:
19
+ factory: 'Tongyi-Qianwen'
20
  api_key: 'sk-xxxxxxxxxxxxx'
21
  oauth:
22
  github:
rag/llm/embedding_model.py CHANGED
@@ -13,6 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  from abc import ABC
17
 
18
  import dashscope
@@ -21,9 +22,21 @@ from FlagEmbedding import FlagModel
21
  import torch
22
  import numpy as np
23
  from huggingface_hub import snapshot_download
 
 
24
  from rag.utils import num_tokens_from_string
25
 
26
- flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
 
 
 
 
 
 
 
 
 
 
27
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
28
  use_fp16=torch.cuda.is_available())
29
 
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
+ import os
17
  from abc import ABC
18
 
19
  import dashscope
 
22
  import torch
23
  import numpy as np
24
  from huggingface_hub import snapshot_download
25
+
26
+ from api.utils.file_utils import get_project_base_directory
27
  from rag.utils import num_tokens_from_string
28
 
29
+ try:
30
+ model_dir = snapshot_download(
31
+ repo_id="BAAI/bge-large-zh-v1.5",
32
+ local_dir=os.path.join(
33
+ get_project_base_directory(),
34
+ "rag/res/bge-large-zh-v1.5"),
35
+ local_files_only=True)
36
+ except Exception as e:
37
+ model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
38
+
39
+ flag_model = FlagModel(model_dir,
40
  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
41
  use_fp16=torch.cuda.is_available())
42
 
rag/svr/task_executor.py CHANGED
@@ -172,7 +172,7 @@ def init_kb(row):
172
  def embedding(docs, mdl, parser_config={}, callback=None):
173
  batch_size = 32
174
  tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
175
- d["content_with_weight"] for d in docs]
176
  tk_count = 0
177
  if len(tts) == len(cnts):
178
  tts_ = np.array([])
 
172
  def embedding(docs, mdl, parser_config={}, callback=None):
173
  batch_size = 32
174
  tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
175
+ re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
176
  tk_count = 0
177
  if len(tts) == len(cnts):
178
  tts_ = np.array([])