Try to reuse existing chunks (#3983)
Browse files### What problem does this PR solve?
Try to reuse existing chunks. Close #3793
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/document_app.py +18 -10
 - api/db/db_models.py +14 -0
 - api/db/services/document_service.py +25 -0
 - api/db/services/task_service.py +86 -14
 - poetry.lock +20 -30
 - pyproject.toml +1 -0
 - rag/svr/task_executor.py +76 -29
 
    	
        api/apps/document_app.py
    CHANGED
    
    | 
         @@ -21,19 +21,25 @@ import flask 
     | 
|
| 21 | 
         
             
            from flask import request
         
     | 
| 22 | 
         
             
            from flask_login import login_required, current_user
         
     | 
| 23 | 
         | 
| 24 | 
         
            -
            from  
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 25 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 26 | 
         
             
            from api.db.services.file_service import FileService
         
     | 
| 27 | 
         
            -
            from api.db.services.task_service import  
     | 
| 28 | 
         
             
            from api.db.services.user_service import UserTenantService
         
     | 
| 29 | 
         
            -
            from deepdoc.parser.html_parser import RAGFlowHtmlParser
         
     | 
| 30 | 
         
            -
            from rag.nlp import search
         
     | 
| 31 | 
         
             
            from api.db.services import duplicate_name
         
     | 
| 32 | 
         
             
            from api.db.services.knowledgebase_service import KnowledgebaseService
         
     | 
| 33 | 
         
            -
            from api. 
     | 
| 34 | 
         
            -
            from api.utils import get_uuid
         
     | 
| 35 | 
         
            -
            from api.db import FileType, TaskStatus, ParserType, FileSource
         
     | 
| 36 | 
         
             
            from api.db.services.document_service import DocumentService, doc_upload_and_parse
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 37 | 
         
             
            from api import settings
         
     | 
| 38 | 
         
             
            from api.utils.api_utils import get_json_result
         
     | 
| 39 | 
         
             
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 
         @@ -316,6 +322,7 @@ def rm(): 
     | 
|
| 316 | 
         | 
| 317 | 
         
             
                        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
         
     | 
| 318 | 
         | 
| 
         | 
|
| 319 | 
         
             
                        if not DocumentService.remove_document(doc, tenant_id):
         
     | 
| 320 | 
         
             
                            return get_data_error_result(
         
     | 
| 321 | 
         
             
                                message="Database error (Document removal)!")
         
     | 
| 
         @@ -361,11 +368,12 @@ def run(): 
     | 
|
| 361 | 
         
             
                        e, doc = DocumentService.get_by_id(id)
         
     | 
| 362 | 
         
             
                        if not e:
         
     | 
| 363 | 
         
             
                            return get_data_error_result(message="Document not found!")
         
     | 
| 364 | 
         
            -
                        if  
     | 
| 365 | 
         
            -
                             
     | 
| 
         | 
|
| 
         | 
|
| 366 | 
         | 
| 367 | 
         
             
                        if str(req["run"]) == TaskStatus.RUNNING.value:
         
     | 
| 368 | 
         
            -
                            TaskService.filter_delete([Task.doc_id == id])
         
     | 
| 369 | 
         
             
                            e, doc = DocumentService.get_by_id(id)
         
     | 
| 370 | 
         
             
                            doc = doc.to_dict()
         
     | 
| 371 | 
         
             
                            doc["tenant_id"] = tenant_id
         
     | 
| 
         | 
|
| 21 | 
         
             
            from flask import request
         
     | 
| 22 | 
         
             
            from flask_login import login_required, current_user
         
     | 
| 23 | 
         | 
| 24 | 
         
            +
            from deepdoc.parser.html_parser import RAGFlowHtmlParser
         
     | 
| 25 | 
         
            +
            from rag.nlp import search
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            from api.db import FileType, TaskStatus, ParserType, FileSource
         
     | 
| 28 | 
         
            +
            from api.db.db_models import File, Task
         
     | 
| 29 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 30 | 
         
             
            from api.db.services.file_service import FileService
         
     | 
| 31 | 
         
            +
            from api.db.services.task_service import queue_tasks
         
     | 
| 32 | 
         
             
            from api.db.services.user_service import UserTenantService
         
     | 
| 
         | 
|
| 
         | 
|
| 33 | 
         
             
            from api.db.services import duplicate_name
         
     | 
| 34 | 
         
             
            from api.db.services.knowledgebase_service import KnowledgebaseService
         
     | 
| 35 | 
         
            +
            from api.db.services.task_service import TaskService
         
     | 
| 
         | 
|
| 
         | 
|
| 36 | 
         
             
            from api.db.services.document_service import DocumentService, doc_upload_and_parse
         
     | 
| 37 | 
         
            +
            from api.utils.api_utils import (
         
     | 
| 38 | 
         
            +
                server_error_response,
         
     | 
| 39 | 
         
            +
                get_data_error_result,
         
     | 
| 40 | 
         
            +
                validate_request,
         
     | 
| 41 | 
         
            +
            )
         
     | 
| 42 | 
         
            +
            from api.utils import get_uuid
         
     | 
| 43 | 
         
             
            from api import settings
         
     | 
| 44 | 
         
             
            from api.utils.api_utils import get_json_result
         
     | 
| 45 | 
         
             
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 
         | 
|
| 322 | 
         | 
| 323 | 
         
             
                        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
         
     | 
| 324 | 
         | 
| 325 | 
         
            +
                        TaskService.filter_delete([Task.doc_id == doc_id])
         
     | 
| 326 | 
         
             
                        if not DocumentService.remove_document(doc, tenant_id):
         
     | 
| 327 | 
         
             
                            return get_data_error_result(
         
     | 
| 328 | 
         
             
                                message="Database error (Document removal)!")
         
     | 
| 
         | 
|
| 368 | 
         
             
                        e, doc = DocumentService.get_by_id(id)
         
     | 
| 369 | 
         
             
                        if not e:
         
     | 
| 370 | 
         
             
                            return get_data_error_result(message="Document not found!")
         
     | 
| 371 | 
         
            +
                        if req.get("delete", False):
         
     | 
| 372 | 
         
            +
                            TaskService.filter_delete([Task.doc_id == id])
         
     | 
| 373 | 
         
            +
                            if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
         
     | 
| 374 | 
         
            +
                                settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
         
     | 
| 375 | 
         | 
| 376 | 
         
             
                        if str(req["run"]) == TaskStatus.RUNNING.value:
         
     | 
| 
         | 
|
| 377 | 
         
             
                            e, doc = DocumentService.get_by_id(id)
         
     | 
| 378 | 
         
             
                            doc = doc.to_dict()
         
     | 
| 379 | 
         
             
                            doc["tenant_id"] = tenant_id
         
     | 
    	
        api/db/db_models.py
    CHANGED
    
    | 
         @@ -855,6 +855,8 @@ class Task(DataBaseModel): 
     | 
|
| 855 | 
         
             
                    help_text="process message",
         
     | 
| 856 | 
         
             
                    default="")
         
     | 
| 857 | 
         
             
                retry_count = IntegerField(default=0)
         
     | 
| 
         | 
|
| 
         | 
|
| 858 | 
         | 
| 859 | 
         | 
| 860 | 
         
             
            class Dialog(DataBaseModel):
         
     | 
| 
         @@ -1090,4 +1092,16 @@ def migrate_db(): 
     | 
|
| 1090 | 
         
             
                        )
         
     | 
| 1091 | 
         
             
                    except Exception:
         
     | 
| 1092 | 
         
             
                        pass
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1093 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 855 | 
         
             
                    help_text="process message",
         
     | 
| 856 | 
         
             
                    default="")
         
     | 
| 857 | 
         
             
                retry_count = IntegerField(default=0)
         
     | 
| 858 | 
         
            +
                digest = TextField(null=True, help_text="task digest", default="")
         
     | 
| 859 | 
         
            +
                chunk_ids = LongTextField(null=True, help_text="chunk ids", default="")
         
     | 
| 860 | 
         | 
| 861 | 
         | 
| 862 | 
         
             
            class Dialog(DataBaseModel):
         
     | 
| 
         | 
|
| 1092 | 
         
             
                        )
         
     | 
| 1093 | 
         
             
                    except Exception:
         
     | 
| 1094 | 
         
             
                        pass
         
     | 
| 1095 | 
         
            +
                    try:
         
     | 
| 1096 | 
         
            +
                        migrate(
         
     | 
| 1097 | 
         
            +
                            migrator.add_column("task", "digest", TextField(null=True, help_text="task digest", default=""))
         
     | 
| 1098 | 
         
            +
                        )
         
     | 
| 1099 | 
         
            +
                    except Exception:
         
     | 
| 1100 | 
         
            +
                        pass
         
     | 
| 1101 | 
         | 
| 1102 | 
         
            +
                    try:
         
     | 
| 1103 | 
         
            +
                        migrate(
         
     | 
| 1104 | 
         
            +
                            migrator.add_column("task", "chunk_ids", LongTextField(null=True, help_text="chunk ids", default=""))
         
     | 
| 1105 | 
         
            +
                        )
         
     | 
| 1106 | 
         
            +
                    except Exception:
         
     | 
| 1107 | 
         
            +
                        pass
         
     | 
    	
        api/db/services/document_service.py
    CHANGED
    
    | 
         @@ -282,6 +282,31 @@ class DocumentService(CommonService): 
     | 
|
| 282 | 
         
             
                        return
         
     | 
| 283 | 
         
             
                    return docs[0]["embd_id"]
         
     | 
| 284 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 285 | 
         
             
                @classmethod
         
     | 
| 286 | 
         
             
                @DB.connection_context()
         
     | 
| 287 | 
         
             
                def get_doc_id_by_doc_name(cls, doc_name):
         
     | 
| 
         | 
|
| 282 | 
         
             
                        return
         
     | 
| 283 | 
         
             
                    return docs[0]["embd_id"]
         
     | 
| 284 | 
         | 
| 285 | 
         
            +
                @classmethod
         
     | 
| 286 | 
         
            +
                @DB.connection_context()
         
     | 
| 287 | 
         
            +
                def get_chunking_config(cls, doc_id):
         
     | 
| 288 | 
         
            +
                    configs = (
         
     | 
| 289 | 
         
            +
                        cls.model.select(
         
     | 
| 290 | 
         
            +
                            cls.model.id,
         
     | 
| 291 | 
         
            +
                            cls.model.kb_id,
         
     | 
| 292 | 
         
            +
                            cls.model.parser_id,
         
     | 
| 293 | 
         
            +
                            cls.model.parser_config,
         
     | 
| 294 | 
         
            +
                            Knowledgebase.language,
         
     | 
| 295 | 
         
            +
                            Knowledgebase.embd_id,
         
     | 
| 296 | 
         
            +
                            Tenant.id.alias("tenant_id"),
         
     | 
| 297 | 
         
            +
                            Tenant.img2txt_id,
         
     | 
| 298 | 
         
            +
                            Tenant.asr_id,
         
     | 
| 299 | 
         
            +
                            Tenant.llm_id,
         
     | 
| 300 | 
         
            +
                        )
         
     | 
| 301 | 
         
            +
                        .join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
         
     | 
| 302 | 
         
            +
                        .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
         
     | 
| 303 | 
         
            +
                        .where(cls.model.id == doc_id)
         
     | 
| 304 | 
         
            +
                    )
         
     | 
| 305 | 
         
            +
                    configs = configs.dicts()
         
     | 
| 306 | 
         
            +
                    if not configs:
         
     | 
| 307 | 
         
            +
                        return None
         
     | 
| 308 | 
         
            +
                    return configs[0]
         
     | 
| 309 | 
         
            +
             
     | 
| 310 | 
         
             
                @classmethod
         
     | 
| 311 | 
         
             
                @DB.connection_context()
         
     | 
| 312 | 
         
             
                def get_doc_id_by_doc_name(cls, doc_name):
         
     | 
    	
        api/db/services/task_service.py
    CHANGED
    
    | 
         @@ -15,6 +15,8 @@ 
     | 
|
| 15 | 
         
             
            #
         
     | 
| 16 | 
         
             
            import os
         
     | 
| 17 | 
         
             
            import random
         
     | 
| 
         | 
|
| 
         | 
|
| 18 | 
         | 
| 19 | 
         
             
            from api.db.db_utils import bulk_insert_into_db
         
     | 
| 20 | 
         
             
            from deepdoc.parser import PdfParser
         
     | 
| 
         @@ -29,7 +31,21 @@ from deepdoc.parser.excel_parser import RAGFlowExcelParser 
     | 
|
| 29 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 30 | 
         
             
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 31 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 
         | 
|
| 
         | 
|
| 32 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 33 | 
         | 
| 34 | 
         
             
            class TaskService(CommonService):
         
     | 
| 35 | 
         
             
                model = Task
         
     | 
| 
         @@ -87,6 +103,30 @@ class TaskService(CommonService): 
     | 
|
| 87 | 
         | 
| 88 | 
         
             
                    return docs[0]
         
     | 
| 89 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 90 | 
         
             
                @classmethod
         
     | 
| 91 | 
         
             
                @DB.connection_context()
         
     | 
| 92 | 
         
             
                def get_ongoing_doc_name(cls):
         
     | 
| 
         @@ -133,22 +173,18 @@ class TaskService(CommonService): 
     | 
|
| 133 | 
         
             
                @classmethod
         
     | 
| 134 | 
         
             
                @DB.connection_context()
         
     | 
| 135 | 
         
             
                def do_cancel(cls, id):
         
     | 
| 136 | 
         
            -
                     
     | 
| 137 | 
         
            -
             
     | 
| 138 | 
         
            -
             
     | 
| 139 | 
         
            -
                        return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
         
     | 
| 140 | 
         
            -
                    except Exception:
         
     | 
| 141 | 
         
            -
                        pass
         
     | 
| 142 | 
         
            -
                    return False
         
     | 
| 143 | 
         | 
| 144 | 
         
             
                @classmethod
         
     | 
| 145 | 
         
             
                @DB.connection_context()
         
     | 
| 146 | 
         
             
                def update_progress(cls, id, info):
         
     | 
| 147 | 
         
             
                    if os.environ.get("MACOS"):
         
     | 
| 148 | 
         
             
                        if info["progress_msg"]:
         
     | 
| 149 | 
         
            -
                            cls.model. 
     | 
| 150 | 
         
            -
             
     | 
| 151 | 
         
            -
                            ).where(cls.model.id == id).execute()
         
     | 
| 152 | 
         
             
                        if "progress" in info:
         
     | 
| 153 | 
         
             
                            cls.model.update(progress=info["progress"]).where(
         
     | 
| 154 | 
         
             
                                cls.model.id == id
         
     | 
| 
         @@ -157,9 +193,9 @@ class TaskService(CommonService): 
     | 
|
| 157 | 
         | 
| 158 | 
         
             
                    with DB.lock("update_progress", -1):
         
     | 
| 159 | 
         
             
                        if info["progress_msg"]:
         
     | 
| 160 | 
         
            -
                            cls.model. 
     | 
| 161 | 
         
            -
             
     | 
| 162 | 
         
            -
                            ).where(cls.model.id == id).execute()
         
     | 
| 163 | 
         
             
                        if "progress" in info:
         
     | 
| 164 | 
         
             
                            cls.model.update(progress=info["progress"]).where(
         
     | 
| 165 | 
         
             
                                cls.model.id == id
         
     | 
| 
         @@ -168,7 +204,7 @@ class TaskService(CommonService): 
     | 
|
| 168 | 
         | 
| 169 | 
         
             
            def queue_tasks(doc: dict, bucket: str, name: str):
         
     | 
| 170 | 
         
             
                def new_task():
         
     | 
| 171 | 
         
            -
                    return {"id": get_uuid(), "doc_id": doc["id"]}
         
     | 
| 172 | 
         | 
| 173 | 
         
             
                tsks = []
         
     | 
| 174 | 
         | 
| 
         @@ -203,10 +239,46 @@ def queue_tasks(doc: dict, bucket: str, name: str): 
     | 
|
| 203 | 
         
             
                else:
         
     | 
| 204 | 
         
             
                    tsks.append(new_task())
         
     | 
| 205 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 206 | 
         
             
                bulk_insert_into_db(Task, tsks, True)
         
     | 
| 207 | 
         
             
                DocumentService.begin2parse(doc["id"])
         
     | 
| 208 | 
         | 
| 
         | 
|
| 209 | 
         
             
                for t in tsks:
         
     | 
| 210 | 
         
             
                    assert REDIS_CONN.queue_product(
         
     | 
| 211 | 
         
             
                        SVR_QUEUE_NAME, message=t
         
     | 
| 212 | 
         
             
                    ), "Can't access Redis. Please check the Redis' status."
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 15 | 
         
             
            #
         
     | 
| 16 | 
         
             
            import os
         
     | 
| 17 | 
         
             
            import random
         
     | 
| 18 | 
         
            +
            import xxhash
         
     | 
| 19 | 
         
            +
            import bisect
         
     | 
| 20 | 
         | 
| 21 | 
         
             
            from api.db.db_utils import bulk_insert_into_db
         
     | 
| 22 | 
         
             
            from deepdoc.parser import PdfParser
         
     | 
| 
         | 
|
| 31 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 32 | 
         
             
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 33 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 34 | 
         
            +
            from api import settings
         
     | 
| 35 | 
         
            +
            from rag.nlp import search
         
     | 
| 36 | 
         | 
| 37 | 
         
            +
            def trim_header_by_lines(text: str, max_length) -> str:
         
     | 
| 38 | 
         
            +
                if len(text) <= max_length:
         
     | 
| 39 | 
         
            +
                    return text
         
     | 
| 40 | 
         
            +
                lines = text.split("\n")
         
     | 
| 41 | 
         
            +
                total = 0
         
     | 
| 42 | 
         
            +
                idx = len(lines) - 1
         
     | 
| 43 | 
         
            +
                for i in range(len(lines)-1, -1, -1):
         
     | 
| 44 | 
         
            +
                    if total + len(lines[i]) > max_length:
         
     | 
| 45 | 
         
            +
                        break
         
     | 
| 46 | 
         
            +
                    idx = i
         
     | 
| 47 | 
         
            +
                text2 = "\n".join(lines[idx:])
         
     | 
| 48 | 
         
            +
                return text2
         
     | 
| 49 | 
         | 
| 50 | 
         
             
            class TaskService(CommonService):
         
     | 
| 51 | 
         
             
                model = Task
         
     | 
| 
         | 
|
| 103 | 
         | 
| 104 | 
         
             
                    return docs[0]
         
     | 
| 105 | 
         | 
| 106 | 
         
            +
                @classmethod
         
     | 
| 107 | 
         
            +
                @DB.connection_context()
         
     | 
| 108 | 
         
            +
                def get_tasks(cls, doc_id: str):
         
     | 
| 109 | 
         
            +
                    fields = [
         
     | 
| 110 | 
         
            +
                        cls.model.id,
         
     | 
| 111 | 
         
            +
                        cls.model.from_page,
         
     | 
| 112 | 
         
            +
                        cls.model.progress,
         
     | 
| 113 | 
         
            +
                        cls.model.digest,
         
     | 
| 114 | 
         
            +
                        cls.model.chunk_ids,
         
     | 
| 115 | 
         
            +
                    ]
         
     | 
| 116 | 
         
            +
                    tasks = (
         
     | 
| 117 | 
         
            +
                        cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc())
         
     | 
| 118 | 
         
            +
                        .where(cls.model.doc_id == doc_id)
         
     | 
| 119 | 
         
            +
                    )
         
     | 
| 120 | 
         
            +
                    tasks = list(tasks.dicts())
         
     | 
| 121 | 
         
            +
                    if not tasks:
         
     | 
| 122 | 
         
            +
                        return None
         
     | 
| 123 | 
         
            +
                    return tasks
         
     | 
| 124 | 
         
            +
             
     | 
| 125 | 
         
            +
                @classmethod
         
     | 
| 126 | 
         
            +
                @DB.connection_context()
         
     | 
| 127 | 
         
            +
                def update_chunk_ids(cls, id: str, chunk_ids: str):
         
     | 
| 128 | 
         
            +
                    cls.model.update(chunk_ids=chunk_ids).where(cls.model.id == id).execute()
         
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
             
                @classmethod
         
     | 
| 131 | 
         
             
                @DB.connection_context()
         
     | 
| 132 | 
         
             
                def get_ongoing_doc_name(cls):
         
     | 
| 
         | 
|
| 173 | 
         
             
                @classmethod
         
     | 
| 174 | 
         
             
                @DB.connection_context()
         
     | 
| 175 | 
         
             
                def do_cancel(cls, id):
         
     | 
| 176 | 
         
            +
                    task = cls.model.get_by_id(id)
         
     | 
| 177 | 
         
            +
                    _, doc = DocumentService.get_by_id(task.doc_id)
         
     | 
| 178 | 
         
            +
                    return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 179 | 
         | 
| 180 | 
         
             
                @classmethod
         
     | 
| 181 | 
         
             
                @DB.connection_context()
         
     | 
| 182 | 
         
             
                def update_progress(cls, id, info):
         
     | 
| 183 | 
         
             
                    if os.environ.get("MACOS"):
         
     | 
| 184 | 
         
             
                        if info["progress_msg"]:
         
     | 
| 185 | 
         
            +
                            task = cls.model.get_by_id(id)
         
     | 
| 186 | 
         
            +
                            progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
         
     | 
| 187 | 
         
            +
                            cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
         
     | 
| 188 | 
         
             
                        if "progress" in info:
         
     | 
| 189 | 
         
             
                            cls.model.update(progress=info["progress"]).where(
         
     | 
| 190 | 
         
             
                                cls.model.id == id
         
     | 
| 
         | 
|
| 193 | 
         | 
| 194 | 
         
             
                    with DB.lock("update_progress", -1):
         
     | 
| 195 | 
         
             
                        if info["progress_msg"]:
         
     | 
| 196 | 
         
            +
                            task = cls.model.get_by_id(id)
         
     | 
| 197 | 
         
            +
                            progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
         
     | 
| 198 | 
         
            +
                            cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
         
     | 
| 199 | 
         
             
                        if "progress" in info:
         
     | 
| 200 | 
         
             
                            cls.model.update(progress=info["progress"]).where(
         
     | 
| 201 | 
         
             
                                cls.model.id == id
         
     | 
| 
         | 
|
| 204 | 
         | 
| 205 | 
         
             
            def queue_tasks(doc: dict, bucket: str, name: str):
         
     | 
| 206 | 
         
             
                def new_task():
         
     | 
| 207 | 
         
            +
                    return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0}
         
     | 
| 208 | 
         | 
| 209 | 
         
             
                tsks = []
         
     | 
| 210 | 
         | 
| 
         | 
|
| 239 | 
         
             
                else:
         
     | 
| 240 | 
         
             
                    tsks.append(new_task())
         
     | 
| 241 | 
         | 
| 242 | 
         
            +
                chunking_config = DocumentService.get_chunking_config(doc["id"])
         
     | 
| 243 | 
         
            +
                for task in tsks:
         
     | 
| 244 | 
         
            +
                    hasher = xxhash.xxh64()
         
     | 
| 245 | 
         
            +
                    for field in sorted(chunking_config.keys()):
         
     | 
| 246 | 
         
            +
                        hasher.update(str(chunking_config[field]).encode("utf-8"))
         
     | 
| 247 | 
         
            +
                    for field in ["doc_id", "from_page", "to_page"]:
         
     | 
| 248 | 
         
            +
                        hasher.update(str(task.get(field, "")).encode("utf-8"))
         
     | 
| 249 | 
         
            +
                    task_digest = hasher.hexdigest()
         
     | 
| 250 | 
         
            +
                    task["digest"] = task_digest
         
     | 
| 251 | 
         
            +
                    task["progress"] = 0.0
         
     | 
| 252 | 
         
            +
             
     | 
| 253 | 
         
            +
                prev_tasks = TaskService.get_tasks(doc["id"])
         
     | 
| 254 | 
         
            +
                if prev_tasks:
         
     | 
| 255 | 
         
            +
                    for task in tsks:
         
     | 
| 256 | 
         
            +
                        reuse_prev_task_chunks(task, prev_tasks, chunking_config)
         
     | 
| 257 | 
         
            +
                    TaskService.filter_delete([Task.doc_id == doc["id"]])
         
     | 
| 258 | 
         
            +
                    chunk_ids = []
         
     | 
| 259 | 
         
            +
                    for task in prev_tasks:
         
     | 
| 260 | 
         
            +
                        if task["chunk_ids"]:
         
     | 
| 261 | 
         
            +
                            chunk_ids.extend(task["chunk_ids"].split())
         
     | 
| 262 | 
         
            +
                    if chunk_ids:
         
     | 
| 263 | 
         
            +
                        settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
         
     | 
| 264 | 
         
            +
             
     | 
| 265 | 
         
             
                bulk_insert_into_db(Task, tsks, True)
         
     | 
| 266 | 
         
             
                DocumentService.begin2parse(doc["id"])
         
     | 
| 267 | 
         | 
| 268 | 
         
            +
                tsks = [task for task in tsks if task["progress"] < 1.0]
         
     | 
| 269 | 
         
             
                for t in tsks:
         
     | 
| 270 | 
         
             
                    assert REDIS_CONN.queue_product(
         
     | 
| 271 | 
         
             
                        SVR_QUEUE_NAME, message=t
         
     | 
| 272 | 
         
             
                    ), "Can't access Redis. Please check the Redis' status."
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
         
     | 
| 275 | 
         
            +
                idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
         
     | 
| 276 | 
         
            +
                if idx >= len(prev_tasks):
         
     | 
| 277 | 
         
            +
                    return
         
     | 
| 278 | 
         
            +
                prev_task = prev_tasks[idx]
         
     | 
| 279 | 
         
            +
                if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
         
     | 
| 280 | 
         
            +
                    return
         
     | 
| 281 | 
         
            +
                task["chunk_ids"] = prev_task["chunk_ids"]
         
     | 
| 282 | 
         
            +
                task["progress"] = 1.0
         
     | 
| 283 | 
         
            +
                task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
         
     | 
| 284 | 
         
            +
                prev_task["chunk_ids"] = ""
         
     | 
    	
        poetry.lock
    CHANGED
    
    | 
         @@ -145,7 +145,7 @@ name = "aiolimiter" 
     | 
|
| 145 | 
         
             
            version = "1.2.0"
         
     | 
| 146 | 
         
             
            description = "asyncio rate limiter, a leaky bucket implementation"
         
     | 
| 147 | 
         
             
            optional = false
         
     | 
| 148 | 
         
            -
            python-versions = " 
     | 
| 149 | 
         
             
            files = [
         
     | 
| 150 | 
         
             
                {file = "aiolimiter-1.2.0-py3-none-any.whl", hash = "sha256:e3fc486a4506248cfdd1f3976920459945944518bbb1d1e6b2be1060232829e2"},
         
     | 
| 151 | 
         
             
                {file = "aiolimiter-1.2.0.tar.gz", hash = "sha256:761455d26df0d7a393f78bd39b022579e02ca5a65beb303a67bed2ded2f740ac"},
         
     | 
| 
         @@ -416,7 +416,7 @@ name = "aspose-slides" 
     | 
|
| 416 | 
         
             
            version = "24.12.0"
         
     | 
| 417 | 
         
             
            description = "Aspose.Slides for Python via .NET is a presentation file formats processing library for working with Microsoft PowerPoint files without using Microsoft PowerPoint."
         
     | 
| 418 | 
         
             
            optional = false
         
     | 
| 419 | 
         
            -
            python-versions = " 
     | 
| 420 | 
         
             
            files = [
         
     | 
| 421 | 
         
             
                {file = "Aspose.Slides-24.12.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ccfaa61a863ed28cd37b221e31a0edf4a83802599d76fb50861c25149ac5e5e3"},
         
     | 
| 422 | 
         
             
                {file = "Aspose.Slides-24.12.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b050659129c5ca92e52fbcd7d5091caa244db731adb68fbea1fd0a8b9fd62a5a"},
         
     | 
| 
         @@ -568,7 +568,7 @@ name = "bce-python-sdk" 
     | 
|
| 568 | 
         
             
            version = "0.9.23"
         
     | 
| 569 | 
         
             
            description = "BCE SDK for python"
         
     | 
| 570 | 
         
             
            optional = false
         
     | 
| 571 | 
         
            -
            python-versions = "!=3.0 
     | 
| 572 | 
         
             
            files = [
         
     | 
| 573 | 
         
             
                {file = "bce_python_sdk-0.9.23-py3-none-any.whl", hash = "sha256:8debe21a040e00060f6044877d594765ed7b18bc765c6bf16b878bca864140a3"},
         
     | 
| 574 | 
         
             
                {file = "bce_python_sdk-0.9.23.tar.gz", hash = "sha256:19739fed5cd0725356fc5ffa2acbdd8fb23f2a81edb91db21a03174551d0cf41"},
         
     | 
| 
         @@ -1502,7 +1502,7 @@ name = "cryptography" 
     | 
|
| 1502 | 
         
             
            version = "44.0.0"
         
     | 
| 1503 | 
         
             
            description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
         
     | 
| 1504 | 
         
             
            optional = false
         
     | 
| 1505 | 
         
            -
            python-versions = "!=3.9.0 
     | 
| 1506 | 
         
             
            files = [
         
     | 
| 1507 | 
         
             
                {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"},
         
     | 
| 1508 | 
         
             
                {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"},
         
     | 
| 
         @@ -1711,7 +1711,7 @@ name = "deprecated" 
     | 
|
| 1711 | 
         
             
            version = "1.2.15"
         
     | 
| 1712 | 
         
             
            description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
         
     | 
| 1713 | 
         
             
            optional = false
         
     | 
| 1714 | 
         
            -
            python-versions = "!=3.0 
     | 
| 1715 | 
         
             
            files = [
         
     | 
| 1716 | 
         
             
                {file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"},
         
     | 
| 1717 | 
         
             
                {file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"},
         
     | 
| 
         @@ -2042,7 +2042,7 @@ name = "fastembed" 
     | 
|
| 2042 | 
         
             
            version = "0.3.6"
         
     | 
| 2043 | 
         
             
            description = "Fast, light, accurate library built for retrieval embedding generation"
         
     | 
| 2044 | 
         
             
            optional = false
         
     | 
| 2045 | 
         
            -
            python-versions = " 
     | 
| 2046 | 
         
             
            files = [
         
     | 
| 2047 | 
         
             
                {file = "fastembed-0.3.6-py3-none-any.whl", hash = "sha256:2bf70edae28bb4ccd9e01617098c2075b0ba35b88025a3d22b0e1e85b2c488ce"},
         
     | 
| 2048 | 
         
             
                {file = "fastembed-0.3.6.tar.gz", hash = "sha256:c93c8ec99b8c008c2d192d6297866b8d70ec7ac8f5696b34eb5ea91f85efd15f"},
         
     | 
| 
         @@ -2624,12 +2624,12 @@ files = [ 
     | 
|
| 2624 | 
         
             
            google-auth = ">=2.14.1,<3.0.dev0"
         
     | 
| 2625 | 
         
             
            googleapis-common-protos = ">=1.56.2,<2.0.dev0"
         
     | 
| 2626 | 
         
             
            grpcio = [
         
     | 
| 2627 | 
         
            -
                {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
         
     | 
| 2628 | 
         
             
                {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
         
     | 
| 
         | 
|
| 2629 | 
         
             
            ]
         
     | 
| 2630 | 
         
             
            grpcio-status = [
         
     | 
| 2631 | 
         
            -
                {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
         
     | 
| 2632 | 
         
             
                {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
         
     | 
| 
         | 
|
| 2633 | 
         
             
            ]
         
     | 
| 2634 | 
         
             
            proto-plus = ">=1.22.3,<2.0.0dev"
         
     | 
| 2635 | 
         
             
            protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
         
     | 
| 
         @@ -2965,7 +2965,7 @@ name = "graspologic" 
     | 
|
| 2965 | 
         
             
            version = "3.4.1"
         
     | 
| 2966 | 
         
             
            description = "A set of Python modules for graph statistics"
         
     | 
| 2967 | 
         
             
            optional = false
         
     | 
| 2968 | 
         
            -
            python-versions = " 
     | 
| 2969 | 
         
             
            files = [
         
     | 
| 2970 | 
         
             
                {file = "graspologic-3.4.1-py3-none-any.whl", hash = "sha256:c6563e087eda599bad1de831d4b7321c0daa7a82f4e85a7d7737ff67e07cdda2"},
         
     | 
| 2971 | 
         
             
                {file = "graspologic-3.4.1.tar.gz", hash = "sha256:7561f0b852a2bccd351bff77e8db07d9892f9dfa35a420fdec01690e4fdc8075"},
         
     | 
| 
         @@ -3650,7 +3650,7 @@ name = "infinity-emb" 
     | 
|
| 3650 | 
         
             
            version = "0.0.66"
         
     | 
| 3651 | 
         
             
            description = "Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip."
         
     | 
| 3652 | 
         
             
            optional = false
         
     | 
| 3653 | 
         
            -
            python-versions = " 
     | 
| 3654 | 
         
             
            files = [
         
     | 
| 3655 | 
         
             
                {file = "infinity_emb-0.0.66-py3-none-any.whl", hash = "sha256:1dc6ed9fa48e6cbe83650a7583dbbb4bc393900c39c326bb0aff2ddc090ac018"},
         
     | 
| 3656 | 
         
             
                {file = "infinity_emb-0.0.66.tar.gz", hash = "sha256:9c9a361ccebf8e8f626c1f685286518d03d0c35e7d14179ae7c2500b4fc68b98"},
         
     | 
| 
         @@ -4098,7 +4098,7 @@ name = "litellm" 
     | 
|
| 4098 | 
         
             
            version = "1.48.0"
         
     | 
| 4099 | 
         
             
            description = "Library to easily interface with LLM API providers"
         
     | 
| 4100 | 
         
             
            optional = false
         
     | 
| 4101 | 
         
            -
            python-versions = "!=2.7 
     | 
| 4102 | 
         
             
            files = [
         
     | 
| 4103 | 
         
             
                {file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
         
     | 
| 4104 | 
         
             
                {file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},
         
     | 
| 
         @@ -5416,9 +5416,9 @@ files = [ 
     | 
|
| 5416 | 
         | 
| 5417 | 
         
             
            [package.dependencies]
         
     | 
| 5418 | 
         
             
            numpy = [
         
     | 
| 5419 | 
         
            -
                {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
         
     | 
| 5420 | 
         
             
                {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
         
     | 
| 5421 | 
         
             
                {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
         
     | 
| 
         | 
|
| 5422 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5423 | 
         
             
            ]
         
     | 
| 5424 | 
         | 
| 
         @@ -5440,9 +5440,9 @@ files = [ 
     | 
|
| 5440 | 
         | 
| 5441 | 
         
             
            [package.dependencies]
         
     | 
| 5442 | 
         
             
            numpy = [
         
     | 
| 5443 | 
         
            -
                {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
         
     | 
| 5444 | 
         
             
                {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
         
     | 
| 5445 | 
         
             
                {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
         
     | 
| 
         | 
|
| 5446 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5447 | 
         
             
            ]
         
     | 
| 5448 | 
         | 
| 
         @@ -5657,8 +5657,8 @@ files = [ 
     | 
|
| 5657 | 
         | 
| 5658 | 
         
             
            [package.dependencies]
         
     | 
| 5659 | 
         
             
            numpy = [
         
     | 
| 5660 | 
         
            -
                {version = ">=1.23.2", markers = "python_version == \"3.11\""},
         
     | 
| 5661 | 
         
             
                {version = ">=1.22.4", markers = "python_version < \"3.11\""},
         
     | 
| 
         | 
|
| 5662 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5663 | 
         
             
            ]
         
     | 
| 5664 | 
         
             
            python-dateutil = ">=2.8.2"
         
     | 
| 
         @@ -6276,7 +6276,7 @@ name = "psutil" 
     | 
|
| 6276 | 
         
             
            version = "6.1.0"
         
     | 
| 6277 | 
         
             
            description = "Cross-platform lib for process and system monitoring in Python."
         
     | 
| 6278 | 
         
             
            optional = false
         
     | 
| 6279 | 
         
            -
            python-versions = "!=3.0 
     | 
| 6280 | 
         
             
            files = [
         
     | 
| 6281 | 
         
             
                {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
         
     | 
| 6282 | 
         
             
                {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},
         
     | 
| 
         @@ -6298,8 +6298,8 @@ files = [ 
     | 
|
| 6298 | 
         
             
            ]
         
     | 
| 6299 | 
         | 
| 6300 | 
         
             
            [package.extras]
         
     | 
| 6301 | 
         
            -
            dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", " 
     | 
| 6302 | 
         
            -
            test = ["pytest", "pytest-xdist", "setuptools"]
         
     | 
| 6303 | 
         | 
| 6304 | 
         
             
            [[package]]
         
     | 
| 6305 | 
         
             
            name = "psycopg2-binary"
         
     | 
| 
         @@ -7803,40 +7803,30 @@ files = [ 
     | 
|
| 7803 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
         
     | 
| 7804 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
         
     | 
| 7805 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
         
     | 
| 7806 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
         
     | 
| 7807 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
         
     | 
| 7808 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
         
     | 
| 7809 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
         
     | 
| 7810 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
         
     | 
| 7811 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
         
     | 
| 7812 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
         
     | 
| 7813 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
         
     | 
| 7814 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
         
     | 
| 7815 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
         
     | 
| 7816 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
         
     | 
| 7817 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
         
     | 
| 7818 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
         
     | 
| 7819 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
         
     | 
| 7820 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
         
     | 
| 7821 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
         
     | 
| 7822 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
         
     | 
| 7823 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
         
     | 
| 7824 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
         
     | 
| 7825 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
         
     | 
| 7826 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
         
     | 
| 7827 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
         
     | 
| 7828 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
         
     | 
| 7829 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
         
     | 
| 7830 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
         
     | 
| 7831 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
         
     | 
| 7832 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
         
     | 
| 7833 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
         
     | 
| 7834 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
         
     | 
| 7835 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
         
     | 
| 7836 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
         
     | 
| 7837 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
         
     | 
| 7838 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
         
     | 
| 7839 | 
         
            -
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
         
     | 
| 7840 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
         
     | 
| 7841 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
         
     | 
| 7842 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
         
     | 
| 
         @@ -7847,7 +7837,7 @@ name = "s3transfer" 
     | 
|
| 7847 | 
         
             
            version = "0.10.4"
         
     | 
| 7848 | 
         
             
            description = "An Amazon S3 Transfer Manager"
         
     | 
| 7849 | 
         
             
            optional = false
         
     | 
| 7850 | 
         
            -
            python-versions = ">=3.8"
         
     | 
| 7851 | 
         
             
            files = [
         
     | 
| 7852 | 
         
             
                {file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
         
     | 
| 7853 | 
         
             
                {file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},
         
     | 
| 
         @@ -8309,7 +8299,7 @@ name = "smart-open" 
     | 
|
| 8309 | 
         
             
            version = "7.0.5"
         
     | 
| 8310 | 
         
             
            description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
         
     | 
| 8311 | 
         
             
            optional = false
         
     | 
| 8312 | 
         
            -
            python-versions = " 
     | 
| 8313 | 
         
             
            files = [
         
     | 
| 8314 | 
         
             
                {file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"},
         
     | 
| 8315 | 
         
             
                {file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"},
         
     | 
| 
         @@ -10119,4 +10109,4 @@ cffi = ["cffi (>=1.11)"] 
     | 
|
| 10119 | 
         
             
            [metadata]
         
     | 
| 10120 | 
         
             
            lock-version = "2.0"
         
     | 
| 10121 | 
         
             
            python-versions = ">=3.10,<3.13"
         
     | 
| 10122 | 
         
            -
            content-hash = " 
     | 
| 
         | 
|
| 145 | 
         
             
            version = "1.2.0"
         
     | 
| 146 | 
         
             
            description = "asyncio rate limiter, a leaky bucket implementation"
         
     | 
| 147 | 
         
             
            optional = false
         
     | 
| 148 | 
         
            +
            python-versions = ">=3.8,<4.0"
         
     | 
| 149 | 
         
             
            files = [
         
     | 
| 150 | 
         
             
                {file = "aiolimiter-1.2.0-py3-none-any.whl", hash = "sha256:e3fc486a4506248cfdd1f3976920459945944518bbb1d1e6b2be1060232829e2"},
         
     | 
| 151 | 
         
             
                {file = "aiolimiter-1.2.0.tar.gz", hash = "sha256:761455d26df0d7a393f78bd39b022579e02ca5a65beb303a67bed2ded2f740ac"},
         
     | 
| 
         | 
|
| 416 | 
         
             
            version = "24.12.0"
         
     | 
| 417 | 
         
             
            description = "Aspose.Slides for Python via .NET is a presentation file formats processing library for working with Microsoft PowerPoint files without using Microsoft PowerPoint."
         
     | 
| 418 | 
         
             
            optional = false
         
     | 
| 419 | 
         
            +
            python-versions = ">=3.5,<3.14"
         
     | 
| 420 | 
         
             
            files = [
         
     | 
| 421 | 
         
             
                {file = "Aspose.Slides-24.12.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ccfaa61a863ed28cd37b221e31a0edf4a83802599d76fb50861c25149ac5e5e3"},
         
     | 
| 422 | 
         
             
                {file = "Aspose.Slides-24.12.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b050659129c5ca92e52fbcd7d5091caa244db731adb68fbea1fd0a8b9fd62a5a"},
         
     | 
| 
         | 
|
| 568 | 
         
             
            version = "0.9.23"
         
     | 
| 569 | 
         
             
            description = "BCE SDK for python"
         
     | 
| 570 | 
         
             
            optional = false
         
     | 
| 571 | 
         
            +
            python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4"
         
     | 
| 572 | 
         
             
            files = [
         
     | 
| 573 | 
         
             
                {file = "bce_python_sdk-0.9.23-py3-none-any.whl", hash = "sha256:8debe21a040e00060f6044877d594765ed7b18bc765c6bf16b878bca864140a3"},
         
     | 
| 574 | 
         
             
                {file = "bce_python_sdk-0.9.23.tar.gz", hash = "sha256:19739fed5cd0725356fc5ffa2acbdd8fb23f2a81edb91db21a03174551d0cf41"},
         
     | 
| 
         | 
|
| 1502 | 
         
             
            version = "44.0.0"
         
     | 
| 1503 | 
         
             
            description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
         
     | 
| 1504 | 
         
             
            optional = false
         
     | 
| 1505 | 
         
            +
            python-versions = ">=3.7, !=3.9.0, !=3.9.1"
         
     | 
| 1506 | 
         
             
            files = [
         
     | 
| 1507 | 
         
             
                {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"},
         
     | 
| 1508 | 
         
             
                {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"},
         
     | 
| 
         | 
|
| 1711 | 
         
             
            version = "1.2.15"
         
     | 
| 1712 | 
         
             
            description = "Python @deprecated decorator to deprecate old python classes, functions or methods."
         
     | 
| 1713 | 
         
             
            optional = false
         
     | 
| 1714 | 
         
            +
            python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
         
     | 
| 1715 | 
         
             
            files = [
         
     | 
| 1716 | 
         
             
                {file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"},
         
     | 
| 1717 | 
         
             
                {file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"},
         
     | 
| 
         | 
|
| 2042 | 
         
             
            version = "0.3.6"
         
     | 
| 2043 | 
         
             
            description = "Fast, light, accurate library built for retrieval embedding generation"
         
     | 
| 2044 | 
         
             
            optional = false
         
     | 
| 2045 | 
         
            +
            python-versions = ">=3.8.0,<3.13"
         
     | 
| 2046 | 
         
             
            files = [
         
     | 
| 2047 | 
         
             
                {file = "fastembed-0.3.6-py3-none-any.whl", hash = "sha256:2bf70edae28bb4ccd9e01617098c2075b0ba35b88025a3d22b0e1e85b2c488ce"},
         
     | 
| 2048 | 
         
             
                {file = "fastembed-0.3.6.tar.gz", hash = "sha256:c93c8ec99b8c008c2d192d6297866b8d70ec7ac8f5696b34eb5ea91f85efd15f"},
         
     | 
| 
         | 
|
| 2624 | 
         
             
            google-auth = ">=2.14.1,<3.0.dev0"
         
     | 
| 2625 | 
         
             
            googleapis-common-protos = ">=1.56.2,<2.0.dev0"
         
     | 
| 2626 | 
         
             
            grpcio = [
         
     | 
| 
         | 
|
| 2627 | 
         
             
                {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
         
     | 
| 2628 | 
         
            +
                {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
         
     | 
| 2629 | 
         
             
            ]
         
     | 
| 2630 | 
         
             
            grpcio-status = [
         
     | 
| 
         | 
|
| 2631 | 
         
             
                {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
         
     | 
| 2632 | 
         
            +
                {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
         
     | 
| 2633 | 
         
             
            ]
         
     | 
| 2634 | 
         
             
            proto-plus = ">=1.22.3,<2.0.0dev"
         
     | 
| 2635 | 
         
             
            protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
         
     | 
| 
         | 
|
| 2965 | 
         
             
            version = "3.4.1"
         
     | 
| 2966 | 
         
             
            description = "A set of Python modules for graph statistics"
         
     | 
| 2967 | 
         
             
            optional = false
         
     | 
| 2968 | 
         
            +
            python-versions = ">=3.9,<3.13"
         
     | 
| 2969 | 
         
             
            files = [
         
     | 
| 2970 | 
         
             
                {file = "graspologic-3.4.1-py3-none-any.whl", hash = "sha256:c6563e087eda599bad1de831d4b7321c0daa7a82f4e85a7d7737ff67e07cdda2"},
         
     | 
| 2971 | 
         
             
                {file = "graspologic-3.4.1.tar.gz", hash = "sha256:7561f0b852a2bccd351bff77e8db07d9892f9dfa35a420fdec01690e4fdc8075"},
         
     | 
| 
         | 
|
| 3650 | 
         
             
            version = "0.0.66"
         
     | 
| 3651 | 
         
             
            description = "Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip."
         
     | 
| 3652 | 
         
             
            optional = false
         
     | 
| 3653 | 
         
            +
            python-versions = ">=3.9,<4"
         
     | 
| 3654 | 
         
             
            files = [
         
     | 
| 3655 | 
         
             
                {file = "infinity_emb-0.0.66-py3-none-any.whl", hash = "sha256:1dc6ed9fa48e6cbe83650a7583dbbb4bc393900c39c326bb0aff2ddc090ac018"},
         
     | 
| 3656 | 
         
             
                {file = "infinity_emb-0.0.66.tar.gz", hash = "sha256:9c9a361ccebf8e8f626c1f685286518d03d0c35e7d14179ae7c2500b4fc68b98"},
         
     | 
| 
         | 
|
| 4098 | 
         
             
            version = "1.48.0"
         
     | 
| 4099 | 
         
             
            description = "Library to easily interface with LLM API providers"
         
     | 
| 4100 | 
         
             
            optional = false
         
     | 
| 4101 | 
         
            +
            python-versions = ">=3.8, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*"
         
     | 
| 4102 | 
         
             
            files = [
         
     | 
| 4103 | 
         
             
                {file = "litellm-1.48.0-py3-none-any.whl", hash = "sha256:7765e8a92069778f5fc66aacfabd0e2f8ec8d74fb117f5e475567d89b0d376b9"},
         
     | 
| 4104 | 
         
             
                {file = "litellm-1.48.0.tar.gz", hash = "sha256:31a9b8a25a9daf44c24ddc08bf74298da920f2c5cea44135e5061278d0aa6fc9"},
         
     | 
| 
         | 
|
| 5416 | 
         | 
| 5417 | 
         
             
            [package.dependencies]
         
     | 
| 5418 | 
         
             
            numpy = [
         
     | 
| 
         | 
|
| 5419 | 
         
             
                {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
         
     | 
| 5420 | 
         
             
                {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
         
     | 
| 5421 | 
         
            +
                {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
         
     | 
| 5422 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5423 | 
         
             
            ]
         
     | 
| 5424 | 
         | 
| 
         | 
|
| 5440 | 
         | 
| 5441 | 
         
             
            [package.dependencies]
         
     | 
| 5442 | 
         
             
            numpy = [
         
     | 
| 
         | 
|
| 5443 | 
         
             
                {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
         
     | 
| 5444 | 
         
             
                {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
         
     | 
| 5445 | 
         
            +
                {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
         
     | 
| 5446 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5447 | 
         
             
            ]
         
     | 
| 5448 | 
         | 
| 
         | 
|
| 5657 | 
         | 
| 5658 | 
         
             
            [package.dependencies]
         
     | 
| 5659 | 
         
             
            numpy = [
         
     | 
| 
         | 
|
| 5660 | 
         
             
                {version = ">=1.22.4", markers = "python_version < \"3.11\""},
         
     | 
| 5661 | 
         
            +
                {version = ">=1.23.2", markers = "python_version == \"3.11\""},
         
     | 
| 5662 | 
         
             
                {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
         
     | 
| 5663 | 
         
             
            ]
         
     | 
| 5664 | 
         
             
            python-dateutil = ">=2.8.2"
         
     | 
| 
         | 
|
| 6276 | 
         
             
            version = "6.1.0"
         
     | 
| 6277 | 
         
             
            description = "Cross-platform lib for process and system monitoring in Python."
         
     | 
| 6278 | 
         
             
            optional = false
         
     | 
| 6279 | 
         
            +
            python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
         
     | 
| 6280 | 
         
             
            files = [
         
     | 
| 6281 | 
         
             
                {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"},
         
     | 
| 6282 | 
         
             
                {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"},
         
     | 
| 
         | 
|
| 6298 | 
         
             
            ]
         
     | 
| 6299 | 
         | 
| 6300 | 
         
             
            [package.extras]
         
     | 
| 6301 | 
         
            +
            dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx-rtd-theme", "toml-sort", "twine", "virtualenv", "wheel"]
         
     | 
| 6302 | 
         
            +
            test = ["enum34", "futures", "ipaddress", "mock (==1.0.1)", "pytest (==4.6.11)", "pytest-xdist", "setuptools", "unittest2"]
         
     | 
| 6303 | 
         | 
| 6304 | 
         
             
            [[package]]
         
     | 
| 6305 | 
         
             
            name = "psycopg2-binary"
         
     | 
| 
         | 
|
| 7803 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969"},
         
     | 
| 7804 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df"},
         
     | 
| 7805 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
         
     | 
| 
         | 
|
| 
         | 
|
| 7806 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
         
     | 
| 7807 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
         
     | 
| 7808 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
         
     | 
| 7809 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e"},
         
     | 
| 7810 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e"},
         
     | 
| 7811 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
         
     | 
| 
         | 
|
| 
         | 
|
| 7812 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
         
     | 
| 7813 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
         
     | 
| 7814 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
         
     | 
| 7815 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d"},
         
     | 
| 7816 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c"},
         
     | 
| 7817 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
         
     | 
| 
         | 
|
| 
         | 
|
| 7818 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
         
     | 
| 7819 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
         
     | 
| 7820 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
         
     | 
| 7821 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475"},
         
     | 
| 7822 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef"},
         
     | 
| 7823 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
         
     | 
| 
         | 
|
| 
         | 
|
| 7824 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
         
     | 
| 7825 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
         
     | 
| 7826 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
         
     | 
| 7827 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bc5f1e1c28e966d61d2519f2a3d451ba989f9ea0f2307de7bc45baa526de9e45"},
         
     | 
| 7828 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a0e060aace4c24dcaf71023bbd7d42674e3b230f7e7b97317baf1e953e5b519"},
         
     | 
| 7829 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
         
     | 
| 
         | 
|
| 
         | 
|
| 7830 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
         
     | 
| 7831 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
         
     | 
| 7832 | 
         
             
                {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
         
     | 
| 
         | 
|
| 7837 | 
         
             
            version = "0.10.4"
         
     | 
| 7838 | 
         
             
            description = "An Amazon S3 Transfer Manager"
         
     | 
| 7839 | 
         
             
            optional = false
         
     | 
| 7840 | 
         
            +
            python-versions = ">= 3.8"
         
     | 
| 7841 | 
         
             
            files = [
         
     | 
| 7842 | 
         
             
                {file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
         
     | 
| 7843 | 
         
             
                {file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},
         
     | 
| 
         | 
|
| 8299 | 
         
             
            version = "7.0.5"
         
     | 
| 8300 | 
         
             
            description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
         
     | 
| 8301 | 
         
             
            optional = false
         
     | 
| 8302 | 
         
            +
            python-versions = ">=3.7,<4.0"
         
     | 
| 8303 | 
         
             
            files = [
         
     | 
| 8304 | 
         
             
                {file = "smart_open-7.0.5-py3-none-any.whl", hash = "sha256:8523ed805c12dff3eaa50e9c903a6cb0ae78800626631c5fe7ea073439847b89"},
         
     | 
| 8305 | 
         
             
                {file = "smart_open-7.0.5.tar.gz", hash = "sha256:d3672003b1dbc85e2013e4983b88eb9a5ccfd389b0d4e5015f39a9ee5620ec18"},
         
     | 
| 
         | 
|
| 10109 | 
         
             
            [metadata]
         
     | 
| 10110 | 
         
             
            lock-version = "2.0"
         
     | 
| 10111 | 
         
             
            python-versions = ">=3.10,<3.13"
         
     | 
| 10112 | 
         
            +
            content-hash = "69fbe11a30c649544196546b9384ca5972bfd17a923b7dc8ff340f790984b5df"
         
     | 
    	
        pyproject.toml
    CHANGED
    
    | 
         @@ -121,6 +121,7 @@ pyicu = "^2.13.1" 
     | 
|
| 121 | 
         
             
            flasgger = "^0.9.7.1"
         
     | 
| 122 | 
         
             
            polars = { version = "^1.9.0", markers = "platform_machine == 'x86_64'" }
         
     | 
| 123 | 
         
             
            polars-lts-cpu = { version = "^1.9.0", markers = "platform_machine == 'arm64'" }
         
     | 
| 
         | 
|
| 124 | 
         | 
| 125 | 
         | 
| 126 | 
         
             
            [tool.poetry.group.full]
         
     | 
| 
         | 
|
| 121 | 
         
             
            flasgger = "^0.9.7.1"
         
     | 
| 122 | 
         
             
            polars = { version = "^1.9.0", markers = "platform_machine == 'x86_64'" }
         
     | 
| 123 | 
         
             
            polars-lts-cpu = { version = "^1.9.0", markers = "platform_machine == 'arm64'" }
         
     | 
| 124 | 
         
            +
            xxhash = "^3.5.0"
         
     | 
| 125 | 
         | 
| 126 | 
         | 
| 127 | 
         
             
            [tool.poetry.group.full]
         
     | 
    	
        rag/svr/task_executor.py
    CHANGED
    
    | 
         @@ -39,8 +39,9 @@ from timeit import default_timer as timer 
     | 
|
| 39 | 
         
             
            import tracemalloc
         
     | 
| 40 | 
         | 
| 41 | 
         
             
            import numpy as np
         
     | 
| 
         | 
|
| 42 | 
         | 
| 43 | 
         
            -
            from api.db import LLMType, ParserType
         
     | 
| 44 | 
         
             
            from api.db.services.dialog_service import keyword_extraction, question_proposal
         
     | 
| 45 | 
         
             
            from api.db.services.document_service import DocumentService
         
     | 
| 46 | 
         
             
            from api.db.services.llm_service import LLMBundle
         
     | 
| 
         @@ -89,12 +90,23 @@ DONE_TASKS = 0 
     | 
|
| 89 | 
         
             
            FAILED_TASKS = 0
         
     | 
| 90 | 
         
             
            CURRENT_TASK = None
         
     | 
| 91 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 92 | 
         | 
| 93 | 
         
             
            def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
         
     | 
| 94 | 
         
             
                global PAYLOAD
         
     | 
| 95 | 
         
             
                if prog is not None and prog < 0:
         
     | 
| 96 | 
         
             
                    msg = "[ERROR]" + msg
         
     | 
| 97 | 
         
            -
                 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 98 | 
         
             
                if cancel:
         
     | 
| 99 | 
         
             
                    msg += " [Canceled]"
         
     | 
| 100 | 
         
             
                    prog = -1
         
     | 
| 
         @@ -105,18 +117,22 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing... 
     | 
|
| 105 | 
         
             
                d = {"progress_msg": msg}
         
     | 
| 106 | 
         
             
                if prog is not None:
         
     | 
| 107 | 
         
             
                    d["progress"] = prog
         
     | 
| 
         | 
|
| 
         | 
|
| 108 | 
         
             
                try:
         
     | 
| 109 | 
         
            -
                    logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
         
     | 
| 110 | 
         
             
                    TaskService.update_progress(task_id, d)
         
     | 
| 111 | 
         
            -
                except  
     | 
| 112 | 
         
            -
                    logging. 
     | 
| 113 | 
         
            -
             
     | 
| 114 | 
         
            -
                close_connection()
         
     | 
| 115 | 
         
            -
                if cancel:
         
     | 
| 116 | 
         
             
                    if PAYLOAD:
         
     | 
| 117 | 
         
             
                        PAYLOAD.ack()
         
     | 
| 118 | 
         
             
                        PAYLOAD = None
         
     | 
| 119 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 120 | 
         | 
| 121 | 
         | 
| 122 | 
         
             
            def collect():
         
     | 
| 
         @@ -136,16 +152,22 @@ def collect(): 
     | 
|
| 136 | 
         
             
                if not msg:
         
     | 
| 137 | 
         
             
                    return None
         
     | 
| 138 | 
         | 
| 139 | 
         
            -
                 
     | 
| 140 | 
         
            -
             
     | 
| 141 | 
         
            -
             
     | 
| 142 | 
         
            -
                     
     | 
| 143 | 
         
            -
                     
     | 
| 144 | 
         
            -
             
     | 
| 145 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 146 | 
         
             
                    with mt_lock:
         
     | 
| 147 | 
         
             
                        DONE_TASKS += 1
         
     | 
| 148 | 
         
            -
                    logging. 
     | 
| 149 | 
         
             
                    return None
         
     | 
| 150 | 
         | 
| 151 | 
         
             
                if msg.get("type", "") == "raptor":
         
     | 
| 
         @@ -186,6 +208,8 @@ def build_chunks(task, progress_callback): 
     | 
|
| 186 | 
         
             
                                        to_page=task["to_page"], lang=task["language"], callback=progress_callback,
         
     | 
| 187 | 
         
             
                                        kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
         
     | 
| 188 | 
         
             
                    logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
         
     | 
| 
         | 
|
| 
         | 
|
| 189 | 
         
             
                except Exception as e:
         
     | 
| 190 | 
         
             
                    progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
         
     | 
| 191 | 
         
             
                    logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
         
     | 
| 
         @@ -358,6 +382,8 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None): 
     | 
|
| 358 | 
         
             
                return res, tk_count, vector_size
         
     | 
| 359 | 
         | 
| 360 | 
         | 
| 
         | 
|
| 
         | 
|
| 361 | 
         
             
            def do_handle_task(task):
         
     | 
| 362 | 
         
             
                task_id = task["id"]
         
     | 
| 363 | 
         
             
                task_from_page = task["from_page"]
         
     | 
| 
         @@ -373,6 +399,16 @@ def do_handle_task(task): 
     | 
|
| 373 | 
         | 
| 374 | 
         
             
                # prepare the progress callback function
         
     | 
| 375 | 
         
             
                progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 376 | 
         
             
                try:
         
     | 
| 377 | 
         
             
                    # bind embedding model
         
     | 
| 378 | 
         
             
                    embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
         
     | 
| 
         @@ -390,6 +426,8 @@ def do_handle_task(task): 
     | 
|
| 390 | 
         | 
| 391 | 
         
             
                        # run RAPTOR
         
     | 
| 392 | 
         
             
                        chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
         
     | 
| 
         | 
|
| 
         | 
|
| 393 | 
         
             
                    except Exception as e:
         
     | 
| 394 | 
         
             
                        error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
         
     | 
| 395 | 
         
             
                        progress_callback(-1, msg=error_message)
         
     | 
| 
         @@ -420,6 +458,7 @@ def do_handle_task(task): 
     | 
|
| 420 | 
         
             
                    progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
         
     | 
| 421 | 
         
             
                    logging.info(progress_message)
         
     | 
| 422 | 
         
             
                    progress_callback(msg=progress_message)
         
     | 
| 
         | 
|
| 423 | 
         
             
                # logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
         
     | 
| 424 | 
         
             
                init_kb(task, vector_size)
         
     | 
| 425 | 
         
             
                chunk_count = len(set([chunk["id"] for chunk in chunks]))
         
     | 
| 
         @@ -430,23 +469,25 @@ def do_handle_task(task): 
     | 
|
| 430 | 
         
             
                    doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
         
     | 
| 431 | 
         
             
                    if b % 128 == 0:
         
     | 
| 432 | 
         
             
                        progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
         
     | 
| 433 | 
         
            -
             
     | 
| 434 | 
         
            -
             
     | 
| 435 | 
         
            -
             
     | 
| 436 | 
         
            -
             
     | 
| 437 | 
         
            -
                     
     | 
| 438 | 
         
            -
                     
     | 
| 439 | 
         
            -
                     
     | 
| 440 | 
         
            -
             
     | 
| 441 | 
         
            -
             
     | 
| 442 | 
         
            -
             
     | 
| 443 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 444 | 
         | 
| 445 | 
         
             
                DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
         
     | 
| 446 | 
         | 
| 447 | 
         
             
                time_cost = timer() - start_ts
         
     | 
| 448 | 
         
             
                progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
         
     | 
| 449 | 
         
            -
                logging.info("Chunk doc({}),  
     | 
| 450 | 
         | 
| 451 | 
         | 
| 452 | 
         
             
            def handle_task():
         
     | 
| 
         @@ -462,6 +503,12 @@ def handle_task(): 
     | 
|
| 462 | 
         
             
                            DONE_TASKS += 1
         
     | 
| 463 | 
         
             
                            CURRENT_TASK = None
         
     | 
| 464 | 
         
             
                        logging.info(f"handle_task done for task {json.dumps(task)}")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 465 | 
         
             
                    except Exception:
         
     | 
| 466 | 
         
             
                        with mt_lock:
         
     | 
| 467 | 
         
             
                            FAILED_TASKS += 1
         
     | 
| 
         | 
|
| 39 | 
         
             
            import tracemalloc
         
     | 
| 40 | 
         | 
| 41 | 
         
             
            import numpy as np
         
     | 
| 42 | 
         
            +
            from peewee import DoesNotExist
         
     | 
| 43 | 
         | 
| 44 | 
         
            +
            from api.db import LLMType, ParserType, TaskStatus
         
     | 
| 45 | 
         
             
            from api.db.services.dialog_service import keyword_extraction, question_proposal
         
     | 
| 46 | 
         
             
            from api.db.services.document_service import DocumentService
         
     | 
| 47 | 
         
             
            from api.db.services.llm_service import LLMBundle
         
     | 
| 
         | 
|
| 90 | 
         
             
            FAILED_TASKS = 0
         
     | 
| 91 | 
         
             
            CURRENT_TASK = None
         
     | 
| 92 | 
         | 
| 93 | 
         
            +
            class TaskCanceledException(Exception):
         
     | 
| 94 | 
         
            +
                def __init__(self, msg):
         
     | 
| 95 | 
         
            +
                    self.msg = msg
         
     | 
| 96 | 
         | 
| 97 | 
         
             
            def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
         
     | 
| 98 | 
         
             
                global PAYLOAD
         
     | 
| 99 | 
         
             
                if prog is not None and prog < 0:
         
     | 
| 100 | 
         
             
                    msg = "[ERROR]" + msg
         
     | 
| 101 | 
         
            +
                try:
         
     | 
| 102 | 
         
            +
                    cancel = TaskService.do_cancel(task_id)
         
     | 
| 103 | 
         
            +
                except DoesNotExist:
         
     | 
| 104 | 
         
            +
                    logging.warning(f"set_progress task {task_id} is unknown")
         
     | 
| 105 | 
         
            +
                    if PAYLOAD:
         
     | 
| 106 | 
         
            +
                        PAYLOAD.ack()
         
     | 
| 107 | 
         
            +
                        PAYLOAD = None
         
     | 
| 108 | 
         
            +
                    return
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
             
                if cancel:
         
     | 
| 111 | 
         
             
                    msg += " [Canceled]"
         
     | 
| 112 | 
         
             
                    prog = -1
         
     | 
| 
         | 
|
| 117 | 
         
             
                d = {"progress_msg": msg}
         
     | 
| 118 | 
         
             
                if prog is not None:
         
     | 
| 119 | 
         
             
                    d["progress"] = prog
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
                logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
         
     | 
| 122 | 
         
             
                try:
         
     | 
| 
         | 
|
| 123 | 
         
             
                    TaskService.update_progress(task_id, d)
         
     | 
| 124 | 
         
            +
                except DoesNotExist:
         
     | 
| 125 | 
         
            +
                    logging.warning(f"set_progress task {task_id} is unknown")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 126 | 
         
             
                    if PAYLOAD:
         
     | 
| 127 | 
         
             
                        PAYLOAD.ack()
         
     | 
| 128 | 
         
             
                        PAYLOAD = None
         
     | 
| 129 | 
         
            +
                    return
         
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
            +
                close_connection()
         
     | 
| 132 | 
         
            +
                if cancel and PAYLOAD:
         
     | 
| 133 | 
         
            +
                    PAYLOAD.ack()
         
     | 
| 134 | 
         
            +
                    PAYLOAD = None
         
     | 
| 135 | 
         
            +
                    raise TaskCanceledException(msg)
         
     | 
| 136 | 
         | 
| 137 | 
         | 
| 138 | 
         
             
            def collect():
         
     | 
| 
         | 
|
| 152 | 
         
             
                if not msg:
         
     | 
| 153 | 
         
             
                    return None
         
     | 
| 154 | 
         | 
| 155 | 
         
            +
                task = None
         
     | 
| 156 | 
         
            +
                canceled = False
         
     | 
| 157 | 
         
            +
                try:
         
     | 
| 158 | 
         
            +
                    task = TaskService.get_task(msg["id"])
         
     | 
| 159 | 
         
            +
                    if task:
         
     | 
| 160 | 
         
            +
                        _, doc = DocumentService.get_by_id(task["doc_id"])
         
     | 
| 161 | 
         
            +
                        canceled = doc.run == TaskStatus.CANCEL.value or doc.progress < 0
         
     | 
| 162 | 
         
            +
                except DoesNotExist:
         
     | 
| 163 | 
         
            +
                    pass
         
     | 
| 164 | 
         
            +
                except Exception:
         
     | 
| 165 | 
         
            +
                    logging.exception("collect get_task exception")
         
     | 
| 166 | 
         
            +
                if not task or canceled:
         
     | 
| 167 | 
         
            +
                    state = "is unknown" if not task else "has been cancelled"
         
     | 
| 168 | 
         
             
                    with mt_lock:
         
     | 
| 169 | 
         
             
                        DONE_TASKS += 1
         
     | 
| 170 | 
         
            +
                    logging.info(f"collect task {msg['id']} {state}")
         
     | 
| 171 | 
         
             
                    return None
         
     | 
| 172 | 
         | 
| 173 | 
         
             
                if msg.get("type", "") == "raptor":
         
     | 
| 
         | 
|
| 208 | 
         
             
                                        to_page=task["to_page"], lang=task["language"], callback=progress_callback,
         
     | 
| 209 | 
         
             
                                        kb_id=task["kb_id"], parser_config=task["parser_config"], tenant_id=task["tenant_id"])
         
     | 
| 210 | 
         
             
                    logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
         
     | 
| 211 | 
         
            +
                except TaskCanceledException:
         
     | 
| 212 | 
         
            +
                    raise
         
     | 
| 213 | 
         
             
                except Exception as e:
         
     | 
| 214 | 
         
             
                    progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
         
     | 
| 215 | 
         
             
                    logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
         
     | 
| 
         | 
|
| 382 | 
         
             
                return res, tk_count, vector_size
         
     | 
| 383 | 
         | 
| 384 | 
         | 
| 385 | 
         
            +
             
     | 
| 386 | 
         
            +
             
     | 
| 387 | 
         
             
            def do_handle_task(task):
         
     | 
| 388 | 
         
             
                task_id = task["id"]
         
     | 
| 389 | 
         
             
                task_from_page = task["from_page"]
         
     | 
| 
         | 
|
| 399 | 
         | 
| 400 | 
         
             
                # prepare the progress callback function
         
     | 
| 401 | 
         
             
                progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
         
     | 
| 402 | 
         
            +
             
     | 
| 403 | 
         
            +
                try:
         
     | 
| 404 | 
         
            +
                    task_canceled = TaskService.do_cancel(task_id)
         
     | 
| 405 | 
         
            +
                except DoesNotExist:
         
     | 
| 406 | 
         
            +
                    logging.warning(f"task {task_id} is unknown")
         
     | 
| 407 | 
         
            +
                    return
         
     | 
| 408 | 
         
            +
                if task_canceled:
         
     | 
| 409 | 
         
            +
                    progress_callback(-1, msg="Task has been canceled.")
         
     | 
| 410 | 
         
            +
                    return
         
     | 
| 411 | 
         
            +
             
     | 
| 412 | 
         
             
                try:
         
     | 
| 413 | 
         
             
                    # bind embedding model
         
     | 
| 414 | 
         
             
                    embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
         
     | 
| 
         | 
|
| 426 | 
         | 
| 427 | 
         
             
                        # run RAPTOR
         
     | 
| 428 | 
         
             
                        chunks, token_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
         
     | 
| 429 | 
         
            +
                    except TaskCanceledException:
         
     | 
| 430 | 
         
            +
                        raise
         
     | 
| 431 | 
         
             
                    except Exception as e:
         
     | 
| 432 | 
         
             
                        error_message = f'Fail to bind LLM used by RAPTOR: {str(e)}'
         
     | 
| 433 | 
         
             
                        progress_callback(-1, msg=error_message)
         
     | 
| 
         | 
|
| 458 | 
         
             
                    progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
         
     | 
| 459 | 
         
             
                    logging.info(progress_message)
         
     | 
| 460 | 
         
             
                    progress_callback(msg=progress_message)
         
     | 
| 461 | 
         
            +
             
     | 
| 462 | 
         
             
                # logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
         
     | 
| 463 | 
         
             
                init_kb(task, vector_size)
         
     | 
| 464 | 
         
             
                chunk_count = len(set([chunk["id"] for chunk in chunks]))
         
     | 
| 
         | 
|
| 469 | 
         
             
                    doc_store_result = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
         
     | 
| 470 | 
         
             
                    if b % 128 == 0:
         
     | 
| 471 | 
         
             
                        progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
         
     | 
| 472 | 
         
            +
                    if doc_store_result:
         
     | 
| 473 | 
         
            +
                        error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
         
     | 
| 474 | 
         
            +
                        progress_callback(-1, msg=error_message)
         
     | 
| 475 | 
         
            +
                        raise Exception(error_message)
         
     | 
| 476 | 
         
            +
                    chunk_ids = [chunk["id"] for chunk in chunks[:b + es_bulk_size]]
         
     | 
| 477 | 
         
            +
                    chunk_ids_str = " ".join(chunk_ids)
         
     | 
| 478 | 
         
            +
                    try:
         
     | 
| 479 | 
         
            +
                        TaskService.update_chunk_ids(task["id"], chunk_ids_str)
         
     | 
| 480 | 
         
            +
                    except DoesNotExist:
         
     | 
| 481 | 
         
            +
                        logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
         
     | 
| 482 | 
         
            +
                        doc_store_result = settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id)
         
     | 
| 483 | 
         
            +
                        return
         
     | 
| 484 | 
         
            +
                logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))
         
     | 
| 485 | 
         | 
| 486 | 
         
             
                DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
         
     | 
| 487 | 
         | 
| 488 | 
         
             
                time_cost = timer() - start_ts
         
     | 
| 489 | 
         
             
                progress_callback(prog=1.0, msg="Done ({:.2f}s)".format(time_cost))
         
     | 
| 490 | 
         
            +
                logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, time_cost))
         
     | 
| 491 | 
         | 
| 492 | 
         | 
| 493 | 
         
             
            def handle_task():
         
     | 
| 
         | 
|
| 503 | 
         
             
                            DONE_TASKS += 1
         
     | 
| 504 | 
         
             
                            CURRENT_TASK = None
         
     | 
| 505 | 
         
             
                        logging.info(f"handle_task done for task {json.dumps(task)}")
         
     | 
| 506 | 
         
            +
                    except TaskCanceledException:
         
     | 
| 507 | 
         
            +
                        with mt_lock:
         
     | 
| 508 | 
         
            +
                            DONE_TASKS += 1
         
     | 
| 509 | 
         
            +
                            CURRENT_TASK = None
         
     | 
| 510 | 
         
            +
                        logging.info(f"handle_task got TaskCanceledException for task {json.dumps(task)}")
         
     | 
| 511 | 
         
            +
                        logging.debug("handle_task got TaskCanceledException", exc_info=True)
         
     | 
| 512 | 
         
             
                    except Exception:
         
     | 
| 513 | 
         
             
                        with mt_lock:
         
     | 
| 514 | 
         
             
                            FAILED_TASKS += 1
         
     |