Spaces:

retopara
/

ragflow

Build error

App Files Files Community

KevinHuSh committed on Mar 27, 2024

Commit

8f39e7a

1 Parent(s): 328b4c9

support snapshot download from local (#153)

Browse files

* support snapshot download from local

* let snapshot download from local

Files changed (12) hide show

README.md +9 -3
api/apps/conversation_app.py +6 -1
deepdoc/parser/pdf_parser.py +15 -3
deepdoc/vision/layout_recognizer.py +10 -1
deepdoc/vision/ocr.py +31 -9
deepdoc/vision/recognizer.py +10 -1
deepdoc/vision/table_structure_recognizer.py +10 -1
docker/README.md +1 -1
docker/entrypoint.sh +1 -1
docker/service_conf.yaml +1 -1
rag/llm/embedding_model.py +14 -1
rag/svr/task_executor.py +1 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 <div align="center">
-<a href="https://ragflow.io/">
 <img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
 </a>
 </div>
@@ -11,7 +11,7 @@
 </p>
 <p align="center">
-    <a href="https://ragflow.io" target="_blank">
         <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
     <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
         <img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
@@ -21,7 +21,7 @@
   </a>
 </p>
-[RagFlow](http://ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
 with reasoned and well-founded answers to your questions. Clone this repository and you can deploy your own knowledge management
 platform to empower your business with AI.
@@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
 > The default serving port is 80. If you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml)
 > and change the left part of *'80:80'*.
 # Configuration
 If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
 Please refer to [README](./docker/README.md) and manually set the configuration.

 <div align="center">
+<a href="https://demo.ragflow.io/">
 <img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
 </a>
 </div>
 </p>
 <p align="center">
+    <a href="https://demo.ragflow.io" target="_blank">
         <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
     <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
         <img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
   </a>
 </p>
+[RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
 with reasoned and well-founded answers to your questions. Clone this repository and you can deploy your own knowledge management
 platform to empower your business with AI.
 > The default serving port is 80. If you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml)
 > and change the left part of *'80:80'*.
+# System Architecture Diagram
+<div align="center" style="margin-top:20px;margin-bottom:20px;">
+<img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
+</div>
 # Configuration
 If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
 Please refer to [README](./docker/README.md) and manually set the configuration.

api/apps/conversation_app.py CHANGED Viewed

@@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
     return {
         "answer": "\n".join([clmns, line, rows]),
         "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
-                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
     }

     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
+    doc_aggs = {}
+    for r in tbl["rows"]:
+        if r[docid_idx] not in doc_aggs:
+            doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
+        doc_aggs[r[docid_idx]]["count"] += 1
     return {
         "answer": "\n".join([clmns, line, rows]),
         "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
+                      "doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
     }

deepdoc/parser/pdf_parser.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 import random
 import fitz
@@ -12,10 +13,12 @@ from PIL import Image, ImageDraw
 import numpy as np
 from PyPDF2 import PdfReader as pdf2_read
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy
-from huggingface_hub import hf_hub_download
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
@@ -32,8 +35,17 @@ class HuParser:
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
             self.updown_cnt_mdl.set_param({"device": "cuda"})
-        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
-                                                       filename="updown_concat_xgb.model"))
         self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!

 # -*- coding: utf-8 -*-
+import os
 import random
 import fitz
 import numpy as np
 from PyPDF2 import PdfReader as pdf2_read
+from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy
+from huggingface_hub import hf_hub_download, snapshot_download
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
             self.updown_cnt_mdl.set_param({"device": "cuda"})
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/text_concat_xgb_v1.0",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
+        self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
         self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!

deepdoc/vision/layout_recognizer.py CHANGED Viewed

@@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
              "Equation",
         ]
     def __init__(self, domain):
-        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
         self.garbage_layouts = ["footer", "header", "reference"]

              "Equation",
         ]
     def __init__(self, domain):
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/deepdoc",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
         self.garbage_layouts = ["footer", "header", "reference"]

deepdoc/vision/ocr.py CHANGED Viewed

@@ -14,6 +14,10 @@
 import copy
 import time
 import os
 from .operators import *
 import numpy as np
 import onnxruntime as ort
@@ -21,6 +25,7 @@ import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 def transform(data, ops=None):
     """ transform """
     if ops is None:
@@ -66,9 +71,15 @@ def load_model(model_dir, nm):
     options.intra_op_num_threads = 2
     options.inter_op_num_threads = 2
     if False and ort.get_device() == "GPU":
-        sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
     else:
-        sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
     return sess, sess.get_inputs()[0]
@@ -331,7 +342,8 @@ class TextRecognizer(object):
                     outputs = self.predictor.run(None, input_dict)
                     break
                 except Exception as e:
-                    if i >= 3: raise e
                     time.sleep(5)
             preds = outputs[0]
             rec_result = self.postprocess_op(preds)
@@ -442,7 +454,8 @@ class TextDetector(object):
                 outputs = self.predictor.run(None, input_dict)
                 break
             except Exception as e:
-                if i >= 3: raise e
                 time.sleep(5)
         post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
@@ -466,7 +479,15 @@ class OCR(object):
         """
         if not model_dir:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         self.text_detector = TextDetector(model_dir)
         self.text_recognizer = TextRecognizer(model_dir)
@@ -548,14 +569,16 @@ class OCR(object):
             cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
                 len(dt_boxes), elapse))
-        return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
     def recognize(self, ori_im, box):
         img_crop = self.get_rotate_crop_image(ori_im, box)
         rec_res, elapse = self.text_recognizer([img_crop])
         text, score = rec_res[0]
-        if score < self.drop_score:return ""
         return text
     def __call__(self, img, cls=True):
@@ -600,8 +623,7 @@ class OCR(object):
         end = time.time()
         time_dict['all'] = end - start
-        #for bno in range(len(img_crop_list)):
         #    print(f"{bno}, {rec_res[bno]}")
         return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))

 import copy
 import time
 import os
+from huggingface_hub import snapshot_download
+from api.utils.file_utils import get_project_base_directory
 from .operators import *
 import numpy as np
 import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 def transform(data, ops=None):
     """ transform """
     if ops is None:
     options.intra_op_num_threads = 2
     options.inter_op_num_threads = 2
     if False and ort.get_device() == "GPU":
+        sess = ort.InferenceSession(
+            model_file_path,
+            options=options,
+            providers=['CUDAExecutionProvider'])
     else:
+        sess = ort.InferenceSession(
+            model_file_path,
+            options=options,
+            providers=['CPUExecutionProvider'])
     return sess, sess.get_inputs()[0]
                     outputs = self.predictor.run(None, input_dict)
                     break
                 except Exception as e:
+                    if i >= 3:
+                        raise e
                     time.sleep(5)
             preds = outputs[0]
             rec_result = self.postprocess_op(preds)
                 outputs = self.predictor.run(None, input_dict)
                 break
             except Exception as e:
+                if i >= 3:
+                    raise e
                 time.sleep(5)
         post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
         """
         if not model_dir:
+            try:
+                model_dir = snapshot_download(
+                    repo_id="InfiniFlow/deepdoc",
+                    local_dir=os.path.join(
+                        get_project_base_directory(),
+                        "rag/res/deepdoc"),
+                    local_files_only=True)
+            except Exception as e:
+                model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         self.text_detector = TextDetector(model_dir)
         self.text_recognizer = TextRecognizer(model_dir)
             cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
                 len(dt_boxes), elapse))
+        return zip(self.sorted_boxes(dt_boxes), [
+                   ("", 0) for _ in range(len(dt_boxes))])
     def recognize(self, ori_im, box):
         img_crop = self.get_rotate_crop_image(ori_im, box)
         rec_res, elapse = self.text_recognizer([img_crop])
         text, score = rec_res[0]
+        if score < self.drop_score:
+            return ""
         return text
     def __call__(self, img, cls=True):
         end = time.time()
         time_dict['all'] = end - start
+        # for bno in range(len(img_crop_list)):
         #    print(f"{bno}, {rec_res[bno]}")
         return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))

deepdoc/vision/recognizer.py CHANGED Viewed

@@ -17,6 +17,7 @@ from copy import deepcopy
 import onnxruntime as ort
 from huggingface_hub import snapshot_download
 from .operators import *
 from rag.settings import cron_logger
@@ -35,7 +36,15 @@ class Recognizer(object):
         """
         if not model_dir:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         model_file_path = os.path.join(model_dir, task_name + ".onnx")
         if not os.path.exists(model_file_path):

 import onnxruntime as ort
 from huggingface_hub import snapshot_download
+from api.utils.file_utils import get_project_base_directory
 from .operators import *
 from rag.settings import cron_logger
         """
         if not model_dir:
+            try:
+                model_dir = snapshot_download(
+                    repo_id="InfiniFlow/deepdoc",
+                    local_dir=os.path.join(
+                        get_project_base_directory(),
+                        "rag/res/deepdoc"),
+                    local_files_only=True)
+            except Exception as e:
+                model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         model_file_path = os.path.join(model_dir, task_name + ".onnx")
         if not os.path.exists(model_file_path):

deepdoc/vision/table_structure_recognizer.py CHANGED Viewed

@@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
     ]
     def __init__(self):
-        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
     def __call__(self, images, thr=0.2):

     ]
     def __init__(self):
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/deepdoc",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
         super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
     def __call__(self, images, thr=0.2):

docker/README.md CHANGED Viewed

@@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
 Newly signed-up users use the LLM configured in this part. Otherwise, users need to configure their own LLM in *setting*.
 ### factory
-The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported.
 ### api_key
 The corresponding API key of your assigned LLM vendor.

 Newly signed-up users use the LLM configured in this part. Otherwise, users need to configure their own LLM in *setting*.
 ### factory
+The LLM suppliers. "Tongyi-Qianwen", "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
 ### api_key
 The corresponding API key of your assigned LLM vendor.

docker/entrypoint.sh CHANGED Viewed

@@ -29,7 +29,7 @@ function task_bro(){
 task_bro &
-WS=8
 for ((i=0;i<WS;i++))
 do
   task_exe $i $WS &

 task_bro &
+WS=2
 for ((i=0;i<WS;i++))
 do
   task_exe $i $WS &

docker/service_conf.yaml CHANGED Viewed

@@ -16,7 +16,7 @@ minio:
 es:
   hosts: 'http://es01:9200'
 user_default_llm:
-  factory: '通义千问'
   api_key: 'sk-xxxxxxxxxxxxx'
 oauth:
   github:

 es:
   hosts: 'http://es01:9200'
 user_default_llm:
+  factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
 oauth:
   github:

rag/llm/embedding_model.py CHANGED Viewed

@@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 from abc import ABC
 import dashscope
@@ -21,9 +22,21 @@ from FlagEmbedding import FlagModel
 import torch
 import numpy as np
 from huggingface_hub import snapshot_download
 from rag.utils import num_tokens_from_string
-flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
                        use_fp16=torch.cuda.is_available())

 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import os
 from abc import ABC
 import dashscope
 import torch
 import numpy as np
 from huggingface_hub import snapshot_download
+from api.utils.file_utils import get_project_base_directory
 from rag.utils import num_tokens_from_string
+try:
+    model_dir = snapshot_download(
+        repo_id="BAAI/bge-large-zh-v1.5",
+        local_dir=os.path.join(
+            get_project_base_directory(),
+            "rag/res/bge-large-zh-v1.5"),
+        local_files_only=True)
+except Exception as e:
+    model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
+flag_model = FlagModel(model_dir,
                        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
                        use_fp16=torch.cuda.is_available())

rag/svr/task_executor.py CHANGED Viewed

@@ -172,7 +172,7 @@ def init_kb(row):
 def embedding(docs, mdl, parser_config={}, callback=None):
     batch_size = 32
     tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
-        d["content_with_weight"] for d in docs]
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])

 def embedding(docs, mdl, parser_config={}, callback=None):
     batch_size = 32
     tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
+        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])