optimize srv broker and executor logic (#630)
### What problem does this PR solve?
Optimize the task broker and executor to reduce memory usage and deployment complexity.
### Type of change
- [x] Performance Improvement
- [x] Refactoring
### Change Log
- Enhance the Redis utilities with a message queue built on Redis Streams.
- Rework the task broker logic around the message queue (1. parse events are fetched from the message queue; 2. a `ThreadPoolExecutor` runs the asynchronous executor).
- Rename the `process_duation` column of the document and task tables to `process_duration` (probably just a spelling mistake).
- Reformat some code style issues (just the ones I came across).
- Add `requirements_dev.txt` for developers.
- Add a Redis container to the Docker Compose setup.
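
At a high level, the separate broker process goes away: the web server now splits documents into tasks and pushes them onto a Redis Stream, and each executor consumes from that stream through a consumer group. A minimal sketch of the new flow, using the helpers introduced in this diff (the snippet itself is illustrative, not code from the PR; the group and consumer names are placeholders):

```python
# Illustrative producer/consumer round trip over the new Redis Stream queue.
# Assumes a reachable Redis configured as in service_conf.yaml.
from rag.settings import SVR_QUEUE_NAME
from rag.utils.redis_conn import REDIS_CONN

# Producer side (runs in the API process when a document is switched to RUNNING):
# each task is a small dict that becomes one stream entry.
REDIS_CONN.queue_product(SVR_QUEUE_NAME, message={"id": "<task-uuid>", "doc_id": "<doc-uuid>"})

# Consumer side (runs in each task_executor process): block up to 10 seconds for one
# entry, read the task dict back, then acknowledge it on the consumer group.
payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "example_group", "example_consumer")
if payload:
    task = payload.get_message()
    payload.ack()
```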
---------
Co-authored-by: Kevin Hu <[email protected]>
- .gitignore +2 -0
- api/apps/document_app.py +12 -1
- api/db/services/document_service.py +58 -5
- api/db/services/task_service.py +71 -26
- api/ragflow_server.py +17 -0
- conf/service_conf.yaml +4 -0
- docker/.env +2 -0
- docker/README.md +1 -1
- docker/docker-compose-base.yml +13 -21
- docker/entrypoint.sh +4 -19
- docker/service_conf.yaml +5 -1
- docs/conversation_api.md +1 -1
- rag/llm/embedding_model.py +1 -2
- rag/nlp/query.py +1 -2
- rag/settings.py +6 -0
- rag/svr/task_broker.py +0 -189
- rag/svr/task_executor.py +22 -29
- rag/utils/redis_conn.py +67 -2
- requirements.txt +1 -1
- requirements_dev.txt +126 -0
`.gitignore` (CHANGED)

```diff
@@ -27,3 +27,5 @@ Cargo.lock

 # Exclude the log folder
 docker/ragflow-logs/
+/flask_session
+/logs
```
`api/apps/document_app.py` (CHANGED)

```diff
@@ -14,7 +14,6 @@
 # limitations under the License
 #

-import base64
 import os
 import pathlib
 import re
@@ -24,8 +23,10 @@ from elasticsearch_dsl import Q
 from flask import request
 from flask_login import login_required, current_user

+from api.db.db_models import Task
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.task_service import TaskService, queue_tasks
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
@@ -37,7 +38,9 @@ from api.db.services.document_service import DocumentService
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.utils.minio_conn import MINIO
+from rag.utils.redis_conn import REDIS_CONN
 from api.utils.file_utils import filename_type, thumbnail
+from rag.settings import SVR_QUEUE_NAME


 @manager.route('/upload', methods=['POST'])
@@ -277,6 +280,14 @@ def run():
                 return get_data_error_result(retmsg="Tenant not found!")
             ELASTICSEARCH.deleteByQuery(
                 Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                TaskService.filter_delete([Task.doc_id == id])
+                e, doc = DocumentService.get_by_id(id)
+                doc = doc.to_dict()
+                doc["tenant_id"] = tenant_id
+                bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+                queue_tasks(doc, bucket, name)

         return get_json_result(data=True)
     except Exception as e:
```
`api/db/services/document_service.py` (CHANGED)

```diff
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-
+import random
+from datetime import datetime
 from elasticsearch_dsl import Q

-from api. ...
+from api.settings import stat_logger
+from api.utils import current_timestamp, get_format_time
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search

 from api.db import FileType, TaskStatus
-from api.db.db_models import DB, Knowledgebase, Tenant
+from api.db.db_models import DB, Knowledgebase, Tenant, Task
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -92,7 +93,7 @@ class DocumentService(CommonService):

     @classmethod
     @DB.connection_context()
-    def get_newly_uploaded(cls, ...):
+    def get_newly_uploaded(cls):
         fields = [
             cls.model.id,
             cls.model.kb_id,
@@ -196,3 +197,55 @@ class DocumentService(CommonService):
             on=(Knowledgebase.id == cls.model.kb_id)).where(
                 Knowledgebase.tenant_id == tenant_id)
         return len(docs)
+
+    @classmethod
+    @DB.connection_context()
+    def begin2parse(cls, docid):
+        cls.update_by_id(
+            docid, {"progress": random.random() * 1 / 100.,
+                    "progress_msg": "Task dispatched...",
+                    "process_begin_at": get_format_time()
+                    })
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress(cls):
+        docs = cls.get_unfinished_docs()
+        for d in docs:
+            try:
+                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
+                if not tsks:
+                    continue
+                msg = []
+                prg = 0
+                finished = True
+                bad = 0
+                status = TaskStatus.RUNNING.value
+                for t in tsks:
+                    if 0 <= t.progress < 1:
+                        finished = False
+                    prg += t.progress if t.progress >= 0 else 0
+                    msg.append(t.progress_msg)
+                    if t.progress == -1:
+                        bad += 1
+                prg /= len(tsks)
+                if finished and bad:
+                    prg = -1
+                    status = TaskStatus.FAIL.value
+                elif finished:
+                    status = TaskStatus.DONE.value
+
+                msg = "\n".join(msg)
+                info = {
+                    "process_duation": datetime.timestamp(datetime.now()) - d["process_begin_at"].timestamp(),
+                    "run": status}
+                if prg != 0:
+                    info["progress"] = prg
+                if msg:
+                    info["progress_msg"] = msg
+                cls.update_by_id(d["id"], info)
+            except Exception as e:
+                stat_logger.error("fetch task exception:" + str(e))
```
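
To make the aggregation in `update_progress` concrete, here is a small worked example with made-up task progress values (not data from the PR): a document whose tasks report 1.0, 0.5 and -1 stays in RUNNING with progress 0.5; only once no task is left in the [0, 1) band does the document flip to DONE, or to FAIL if any task reported -1.

```python
# Worked example of the per-document aggregation done by DocumentService.update_progress.
# The progress values below are hypothetical.
task_progress = [1.0, 0.5, -1]                                        # one done, one half-way, one failed
prg = sum(p for p in task_progress if p >= 0) / len(task_progress)   # 0.5
finished = all(not (0 <= p < 1) for p in task_progress)              # False, one task is still running
bad = sum(1 for p in task_progress if p == -1)                       # 1
run = "RUNNING" if not finished else ("FAIL" if bad else "DONE")     # stays RUNNING for now
```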
`api/db/services/task_service.py` (CHANGED)

```diff
@@ -15,21 +15,24 @@
 #
 import random

-from ...
+from api.db.db_utils import bulk_insert_into_db
+from deepdoc.parser import PdfParser
+from peewee import JOIN
 from api.db.db_models import DB, File2Document, File
 from api.db import StatusEnum, FileType, TaskStatus
 from api.db.db_models import Task, Document, Knowledgebase, Tenant
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
-from api.utils import current_timestamp
+from api.utils import current_timestamp, get_uuid
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
+from rag.settings import MINIO, SVR_QUEUE_NAME
+from rag.utils.redis_conn import REDIS_CONN


 class TaskService(CommonService):
     model = Task

-
-    @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
+    def get_tasks(cls, task_id):
         fields = [
             cls.model.id,
             cls.model.doc_id,
@@ -48,28 +51,18 @@ class TaskService(CommonService):
             Tenant.img2txt_id,
             Tenant.asr_id,
             cls.model.update_time]
-        ...
-            Document.run == TaskStatus.RUNNING.value,
-            ~(Document.type == FileType.VIRTUAL.value),
-            cls.model.progress == 0,
-            #cls.model.update_time >= tm,
-            #(Expression(cls.model.create_time, "%%", comm) == mod)
-            )\
-            .order_by(cls.model.update_time.asc())\
-            .paginate(0, items_per_page)
-        docs = list(docs.dicts())
-        if not docs: return []
-        if not takeit: return docs
+        docs = cls.model.select(*fields) \
+            .join(Document, on=(cls.model.doc_id == Document.id)) \
+            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
+            .where(cls.model.id == task_id)
+        docs = list(docs.dicts())
+        if not docs: return []

-        ...
+        cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
+                         progress=random.random() / 10.).where(
+            cls.model.id == docs[0]["id"]).execute()
+        return docs

     @classmethod
     @DB.connection_context()
@@ -112,3 +105,55 @@ class TaskService(CommonService):
         if "progress" in info:
             cls.model.update(progress=info["progress"]).where(
                 cls.model.id == id).execute()
+
+
+def queue_tasks(doc, bucket, name):
+    def new_task():
+        nonlocal doc
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"]
+        }
+    tsks = []
+
+    if doc["type"] == FileType.PDF.value:
+        file_bin = MINIO.get(bucket, name)
+        do_layout = doc["parser_config"].get("layout_recognize", True)
+        pages = PdfParser.total_page_number(doc["name"], file_bin)
+        page_size = doc["parser_config"].get("task_page_size", 12)
+        if doc["parser_id"] == "paper":
+            page_size = doc["parser_config"].get("task_page_size", 22)
+        if doc["parser_id"] == "one":
+            page_size = 1000000000
+        if not do_layout:
+            page_size = 1000000000
+        page_ranges = doc["parser_config"].get("pages")
+        if not page_ranges:
+            page_ranges = [(1, 100000)]
+        for s, e in page_ranges:
+            s -= 1
+            s = max(0, s)
+            e = min(e - 1, pages)
+            for p in range(s, e, page_size):
+                task = new_task()
+                task["from_page"] = p
+                task["to_page"] = min(p + page_size, e)
+                tsks.append(task)
+
+    elif doc["parser_id"] == "table":
+        file_bin = MINIO.get(bucket, name)
+        rn = RAGFlowExcelParser.row_number(doc["name"], file_bin)
+        for i in range(0, rn, 3000):
+            task = new_task()
+            task["from_page"] = i
+            task["to_page"] = min(i + 3000, rn)
+            tsks.append(task)
+    else:
+        tsks.append(new_task())
+
+    for t in tsks:
+        REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
+
+    bulk_insert_into_db(Task, tsks, True)
+    DocumentService.begin2parse(doc["id"])
```
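
The page-splitting in `queue_tasks` is easiest to see with numbers. A hypothetical 30-page PDF with the default `task_page_size` of 12 and no explicit `pages` range yields three tasks:

```python
# Hypothetical illustration of the page-range splitting done by queue_tasks (values made up).
pages, page_size = 30, 12
page_ranges = [(1, 100000)]          # default when parser_config has no "pages" entry
tasks = []
for s, e in page_ranges:
    s = max(0, s - 1)                # switch to 0-based page indices
    e = min(e - 1, pages)
    for p in range(s, e, page_size):
        tasks.append((p, min(p + page_size, e)))
print(tasks)                         # [(0, 12), (12, 24), (24, 30)]
```

Each tuple becomes one `Task` row and one stream entry, so a large PDF is processed by several executor pulls instead of a single long-running job.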
`api/ragflow_server.py` (CHANGED)

```diff
@@ -18,10 +18,14 @@ import logging
 import os
 import signal
 import sys
+import time
 import traceback
+from concurrent.futures import ThreadPoolExecutor
+
 from werkzeug.serving import run_simple
 from api.apps import app
 from api.db.runtime_config import RuntimeConfig
+from api.db.services.document_service import DocumentService
 from api.settings import (
     HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
 )
@@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
 from api.versions import get_versions

+
+def update_progress():
+    while True:
+        time.sleep(1)
+        try:
+            DocumentService.update_progress()
+        except Exception as e:
+            stat_logger.error("update_progress exception:" + str(e))
+
+
 if __name__ == '__main__':
     print("""
     ____ ______ __
@@ -71,6 +85,9 @@ if __name__ == '__main__':
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)

+    thr = ThreadPoolExecutor(max_workers=1)
+    thr.submit(update_progress)
+
     # start http server
     try:
         stat_logger.info("RAG Flow http server start...")
```
`conf/service_conf.yaml` (CHANGED)

```diff
@@ -15,6 +15,10 @@ minio:
   host: 'minio:9000'
 es:
   hosts: 'http://es01:9200'
+redis:
+  db: 1
+  password: 'infini_rag_flow'
+  host: 'redis:6379'
 user_default_llm:
   factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
```
`docker/.env` (CHANGED)

```diff
@@ -25,6 +25,8 @@ MINIO_PORT=9000
 MINIO_USER=rag_flow
 MINIO_PASSWORD=infini_rag_flow

+REDIS_PASSWORD=infini_rag_flow
+
 SVR_HTTP_PORT=9380

 RAGFLOW_VERSION=latest
```
`docker/README.md` (CHANGED)

```diff
@@ -50,7 +50,7 @@ The serving port of mysql inside the container. The modification should be synch
 The max database connection.

 ### stale_timeout
-The timeout ...
+The timeout duration in seconds.

 ## minio

```
`docker/docker-compose-base.yml` (CHANGED)

```diff
@@ -29,24 +29,6 @@ services:
       - ragflow
     restart: always

-  #kibana:
-  #  depends_on:
-  #    es01:
-  #      condition: service_healthy
-  #  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-  #  container_name: ragflow-kibana
-  #  volumes:
-  #    - kibanadata:/usr/share/kibana/data
-  #  ports:
-  #    - ${KIBANA_PORT}:5601
-  #  environment:
-  #    - SERVERNAME=kibana
-  #    - ELASTICSEARCH_HOSTS=http://es01:9200
-  #    - TZ=${TIMEZONE}
-  #  mem_limit: ${MEM_LIMIT}
-  #  networks:
-  #    - ragflow
-
   mysql:
     image: mysql:5.7.18
     container_name: ragflow-mysql
@@ -74,7 +56,6 @@ services:
       retries: 3
     restart: always

-
   minio:
     image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
     container_name: ragflow-minio
@@ -92,16 +73,27 @@ services:
       - ragflow
     restart: always

+  redis:
+    image: redis:7.2.4
+    container_name: ragflow-redis
+    command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
+    volumes:
+      - redis_data:/data
+    networks:
+      - ragflow
+    restart: always
+
+

 volumes:
   esdata01:
     driver: local
-  # kibanadata:
-  #   driver: local
   mysql_data:
     driver: local
   minio_data:
     driver: local
+  redis_data:
+    driver: local

 networks:
   ragflow:
```
`docker/entrypoint.sh` (CHANGED)

```diff
@@ -12,29 +12,14 @@ function task_exe(){
     done
 }

-function watch_broker(){
-    while [ 1 -eq 1 ];do
-        C=`ps aux|grep "task_broker.py"|grep -v grep|wc -l`;
-        if [ $C -lt 1 ];then
-            $PY rag/svr/task_broker.py &
-        fi
-        sleep 5;
-    done
-}
-
-function task_bro(){
-    watch_broker;
-}
-
-task_bro &
-
 WS=1
 for ((i=0;i<WS;i++))
 do
   task_exe $i $WS &
 done

-while [ 1 -eq 1 ];do
-    $PY api/ragflow_server.py
-done
-
+while [ 1 -eq 1 ];do
+    $PY api/ragflow_server.py
+done
+
+wait;
```
`docker/service_conf.yaml` (CHANGED)

```diff
@@ -15,6 +15,10 @@ minio:
   host: 'minio:9000'
 es:
   hosts: 'http://es01:9200'
+redis:
+  db: 1
+  password: 'infini_rag_flow'
+  host: 'redis:6379'
 user_default_llm:
   factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
@@ -34,4 +38,4 @@ authentication:
 permission:
   switch: false
   component: false
-  dataset: false
+  dataset: false
```
`docs/conversation_api.md` (CHANGED)

The only visible change is at the end of the file; the closing code fence is unchanged in content, so this appears to be a trailing-newline fix:

````diff
@@ -361,4 +361,4 @@ This is usually used when upload a file to.
     "retmsg": "success"
 }

-```
+```
````
`rag/llm/embedding_model.py` (CHANGED)

```diff
@@ -95,8 +95,7 @@ class OpenAIEmbed(Base):
     def encode(self, texts: list, batch_size=32):
         res = self.client.embeddings.create(input=texts,
                                             model=self.model_name)
-        return np.array([d.embedding for d in res.data]
-                        ), res.usage.total_tokens
+        return np.array([d.embedding for d in res.data]), res.usage.total_tokens

     def encode_queries(self, text):
         res = self.client.embeddings.create(input=[text],
```
`rag/nlp/query.py` (CHANGED)

```diff
@@ -9,12 +9,11 @@ from elasticsearch_dsl import Q

 from rag.nlp import rag_tokenizer, term_weight, synonym

-
 class EsQueryer:
     def __init__(self, es):
         self.tw = term_weight.Dealer()
         self.es = es
-        self.syn = synonym.Dealer(...
+        self.syn = synonym.Dealer()
         self.flds = ["ask_tks^10", "ask_small_tks"]

     @staticmethod
```
`rag/settings.py` (CHANGED)

```diff
@@ -47,3 +47,9 @@ cron_logger = getLogger("cron_logger")
 cron_logger.setLevel(20)
 chunk_logger = getLogger("chunk_logger")
 database_logger = getLogger("database")
+
+SVR_QUEUE_NAME = "rag_flow_svr_queue"
+SVR_QUEUE_RETENTION = 60*60
+SVR_QUEUE_MAX_LEN = 1024
+SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
+SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
```
`rag/svr/task_broker.py` (DELETED)

The standalone broker process (189 lines) is removed entirely. Its `collect()`/`dispatch()` logic, which polled `DocumentService.get_newly_uploaded`, split each document into page-range or row-range tasks, and bulk-inserted them, now lives in `queue_tasks()` in `api/db/services/task_service.py`; `set_dispatching()` becomes `DocumentService.begin2parse`; and its `update_progress()` loop moves into `DocumentService.update_progress`, driven by the background thread added in `api/ragflow_server.py`. The timestamp-file bookkeeping (`findMaxTm`, `rag/res/broker.tm`) disappears along with the polling loop.
`rag/svr/task_executor.py` (CHANGED)

```diff
@@ -28,7 +28,7 @@ from functools import partial
 from api.db.services.file2document_service import File2DocumentService
 from rag.utils.minio_conn import MINIO
 from api.db.db_models import close_connection
-from rag.settings import database_logger
+from rag.settings import database_logger, SVR_QUEUE_NAME
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from multiprocessing import Pool
 import numpy as np
@@ -93,20 +93,29 @@ def set_progress(task_id, from_page=0, to_page=-1,
     sys.exit()


-def collect(...):
-    ...
-        return pd.DataFrame()
+def collect():
+    try:
+        payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
+        if not payload:
+            time.sleep(1)
+            return pd.DataFrame()
+    except Exception as e:
+        cron_logger.error("Get task event from queue exception:" + str(e))
+        return pd.DataFrame()
+
+    msg = payload.get_message()
+    payload.ack()
+    if not msg: return pd.DataFrame()
+
+    if TaskService.do_cancel(msg["id"]):
+        return pd.DataFrame()
+    tasks = TaskService.get_tasks(msg["id"])
+    assert tasks, "{} empty task!".format(msg["id"])
     tasks = pd.DataFrame(tasks)
-    mtm = tasks["update_time"].max()
-    cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
     return tasks


 def get_minio_binary(bucket, name):
-    global MINIO
     return MINIO.get(bucket, name)

@@ -122,13 +131,10 @@ def build(row):
               row["from_page"],
               row["to_page"])
     chunker = FACTORY[row["parser_id"].lower()]
-    pool = Pool(processes=1)
     try:
         st = timer()
         bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
-        ...
-        binary = thr.get(timeout=90)
-        pool.terminate()
+        binary = get_minio_binary(bucket, name)
         cron_logger.info(
             "From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
         cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
@@ -147,7 +153,6 @@ def build(row):
         else:
             callback(-1, f"Internal server error: %s" %
                      str(e).replace("'", ""))
-        pool.terminate()
         traceback.print_exc()

         cron_logger.error(
@@ -238,20 +243,13 @@ def embedding(docs, mdl, parser_config={}, callback=None):
     return tk_count


-def main(...):
-    ...
-        get_project_base_directory(),
-        "rag/res",
-        f"{comm}-{mod}.tm")
-    tm = findMaxTm(tm_fnm)
-    rows = collect(comm, mod, tm)
+def main():
+    rows = collect()
     if len(rows) == 0:
         return

-    tmf = open(tm_fnm, "a+")
     for _, r in rows.iterrows():
         callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
-        #callback(random.random()/10., "Task has been received.")
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
@@ -265,7 +263,6 @@ def main(comm, mod):
         if cks is None:
             continue
         if not cks:
-            tmf.write(str(r["update_time"]) + "\n")
             callback(1., "No chunk! Done!")
             continue
         # TODO: exception handler
@@ -305,8 +302,6 @@ def main(comm, mod):
             "Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
                 r["id"], tk_count, len(cks), timer()-st))

-        tmf.write(str(r["update_time"]) + "\n")
-    tmf.close()


 if __name__ == "__main__":
@@ -315,8 +310,6 @@ if __name__ == "__main__":
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)

-    #from mpi4py import MPI
-    #comm = MPI.COMM_WORLD
     while True:
-        main(...)
+        main()
         close_connection()
```
`rag/utils/redis_conn.py` (CHANGED)

```diff
@@ -5,6 +5,27 @@ import logging
 from rag import settings
 from rag.utils import singleton

+
+class Payload:
+    def __init__(self, consumer, queue_name, group_name, msg_id, message):
+        self.__consumer = consumer
+        self.__queue_name = queue_name
+        self.__group_name = group_name
+        self.__msg_id = msg_id
+        self.__message = json.loads(message['message'])
+
+    def ack(self):
+        try:
+            self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
+        return False
+
+    def get_message(self):
+        return self.__message
+
+
 @singleton
 class RedisDB:
     def __init__(self):
@@ -17,7 +38,8 @@ class RedisDB:
             self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0],
                                            port=int(self.config.get("host", ":6379").split(":")[1]),
                                            db=int(self.config.get("db", 1)),
-                                           password=self.config...
+                                           password=self.config.get("password"),
+                                           decode_responses=True)
         except Exception as e:
             logging.warning("Redis can't be connected.")
         return self.REDIS
@@ -70,5 +92,48 @@ class RedisDB:
             self.__open__()
         return False

-REDIS_CONN = RedisDB()
+    def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool:
+        try:
+            payload = {"message": json.dumps(message)}
+            pipeline = self.REDIS.pipeline()
+            pipeline.xadd(queue, payload)
+            pipeline.expire(queue, exp)
+            pipeline.execute()
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
+        return False
+
+    def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
+        try:
+            group_info = self.REDIS.xinfo_groups(queue_name)
+            if not any(e["name"] == group_name for e in group_info):
+                self.REDIS.xgroup_create(
+                    queue_name,
+                    group_name,
+                    id="$",
+                    mkstream=True
+                )
+            args = {
+                "groupname": group_name,
+                "consumername": consumer_name,
+                "count": 1,
+                "block": 10000,
+                "streams": {queue_name: msg_id},
+            }
+            messages = self.REDIS.xreadgroup(**args)
+            if not messages:
+                return None
+            stream, element_list = messages[0]
+            msg_id, payload = element_list[0]
+            res = Payload(self.REDIS, queue_name, group_name, msg_id, payload)
+            return res
+        except Exception as e:
+            if 'key' in str(e):
+                pass
+            else:
+                logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e))
+        return None
+
+
+REDIS_CONN = RedisDB()
```
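
`queue_product` and `queue_consumer` are thin wrappers over the standard Redis Stream commands. A standalone sketch with `redis-py`, assuming a local password-less Redis and hypothetical stream/group names, showing the same XADD / XREADGROUP / XACK cycle that these helpers perform:

```python
# Minimal redis-py sketch of the stream operations wrapped by RedisDB (names are placeholders).
import json
import redis

r = redis.StrictRedis(host="localhost", port=6379, db=1, decode_responses=True)

# Produce: XADD appends one entry; EXPIRE bounds the stream's lifetime like queue_product does.
r.xadd("demo_queue", {"message": json.dumps({"id": "t1", "doc_id": "d1"})})
r.expire("demo_queue", 60 * 60)

# Consume: create the consumer group once (from the start of the stream for this demo)...
try:
    r.xgroup_create("demo_queue", "demo_group", id="0", mkstream=True)
except redis.exceptions.ResponseError:
    pass  # group already exists

# ...then read one new entry for this consumer, blocking up to 10 seconds.
entries = r.xreadgroup("demo_group", "consumer-1", {"demo_queue": ">"}, count=1, block=10000)
if entries:
    _stream, elements = entries[0]
    msg_id, fields = elements[0]
    task = json.loads(fields["message"])
    r.xack("demo_queue", "demo_group", msg_id)   # acknowledge, mirroring Payload.ack()
```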
`requirements.txt` (CHANGED)

The last line is unchanged in content; this appears to be a trailing-newline fix:

```diff
@@ -134,4 +134,4 @@ xxhash==3.4.1
 yarl==1.9.4
 zhipuai==2.0.1
 BCEmbedding
-loguru==0.7.2
+loguru==0.7.2
```
`requirements_dev.txt` (ADDED)

```text
accelerate==0.27.2
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
Aspose.Slides==24.2.0
attrs==23.2.0
blinker==1.7.0
cachelib==0.12.0
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
cryptography==42.0.5
dashscope==1.14.1
datasets==2.17.1
datrie==0.8.2
demjson3==3.0.6
dill==0.3.8
distro==1.9.0
elastic-transport==8.12.0
elasticsearch==8.12.1
elasticsearch-dsl==8.12.0
et-xmlfile==1.1.0
filelock==3.13.1
fastembed==0.2.6
FlagEmbedding==1.2.5
Flask==3.0.2
Flask-Cors==4.0.0
Flask-Login==0.6.3
Flask-Session==0.6.0
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2023.10.0
h11==0.14.0
hanziconv==0.3.2
httpcore==1.0.4
httpx==0.27.0
huggingface-hub==0.20.3
humanfriendly==10.0
idna==3.6
install==1.3.5
itsdangerous==2.1.2
Jinja2==3.1.3
joblib==1.3.2
lxml==5.1.0
MarkupSafe==2.1.5
minio==7.2.4
mpi4py==3.1.5
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
networkx==3.2.1
nltk==3.8.1
numpy==1.26.4
openai==1.12.0
opencv-python==4.9.0.80
openpyxl==3.1.2
packaging==23.2
pandas==2.2.1
pdfminer.six==20221105
pdfplumber==0.10.4
peewee==3.17.1
pillow==10.2.0
protobuf==4.25.3
psutil==5.9.8
pyarrow==15.0.0
pyarrow-hotfix==0.6
pyclipper==1.3.0.post5
pycparser==2.21
pycryptodome==3.20.0
pycryptodome-test-vectors==1.0.14
pycryptodomex==3.20.0
pydantic==2.6.2
pydantic_core==2.16.3
PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0
PyPDF2==3.0.1
pypdfium2==4.27.0
python-dateutil==2.8.2
python-docx==1.1.0
python-dotenv==1.0.1
python-pptx==0.6.23
pytz==2024.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.2
scikit-learn==1.4.1.post1
scipy==1.12.0
sentence-transformers==2.4.0
shapely==2.0.3
six==1.16.0
sniffio==1.3.1
StrEnum==0.4.15
sympy==1.12
threadpoolctl==3.3.0
tika==2.6.0
tiktoken==0.6.0
tokenizers==0.15.2
torch==2.2.1
tqdm==4.66.2
transformers==4.38.1
triton==2.2.0
typing_extensions==4.10.0
tzdata==2024.1
urllib3==2.2.1
Werkzeug==3.0.1
xgboost==2.0.3
XlsxWriter==3.2.0
xpinyin==0.7.6
xxhash==3.4.1
yarl==1.9.4
zhipuai==2.0.1
BCEmbedding
loguru==0.7.2
ollama==0.1.8
redis==5.0.4
```