JobSmithManipulation
Kevin Hu committed · Commit 278278b · 1 Parent(s): bac5213
update sdk document and chunk (#2421)
### What problem does this PR solve?
_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Kevin Hu <[email protected]>
- api/apps/sdk/doc.py +373 -25
- sdk/python/ragflow/__init__.py +2 -1
- sdk/python/ragflow/modules/chunk.py +34 -0
- sdk/python/ragflow/modules/document.py +123 -2
- sdk/python/ragflow/ragflow.py +47 -0
- sdk/python/test/t_document.py +128 -3
- sdk/python/test/test_data/ragflow_test.txt +29 -0
- sdk/python/test/test_data/story.txt +8 -0
- sdk/python/test/test_data/test1.txt +3 -1
- sdk/python/test/test_data/test2.txt +4 -0
- sdk/python/test/test_data/test3.txt +4 -0
- sdk/python/test/test_data/westworld.pdf +0 -0
api/apps/sdk/doc.py
CHANGED
@@ -1,19 +1,63 @@
-from …
-from …
+import pathlib
+import re
+import datetime
+import json
+import traceback
+
+from flask import request
+from flask_login import login_required, current_user
+from elasticsearch_dsl import Q
 
+from rag.app.qa import rmPrefix, beAdoc
+from rag.nlp import search, rag_tokenizer, keyword_extraction
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
+from api.db import LLMType, ParserType
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.llm_service import TenantLLMService
+from api.db.services.user_service import UserTenantService
+from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
+from api.db.services.document_service import DocumentService
+from api.settings import RetCode, retrievaler, kg_retrievaler
+from api.utils.api_utils import get_json_result
+import hashlib
+import re
 from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+
+from api.db.db_models import Task, File
+
+from api.db.services.task_service import TaskService, queue_tasks
+from api.db.services.user_service import TenantService, UserTenantService
+
+from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
+
+from api.utils.api_utils import get_json_result
+
+from functools import partial
+from io import BytesIO
+
+from elasticsearch_dsl import Q
+from flask import request, send_file
+from flask_login import login_required
+
+from api.db import FileSource, TaskStatus, FileType
 from api.db.db_models import File
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.…
-from api.settings import RetCode
+from api.settings import RetCode, retrievaler
 from api.utils.api_utils import construct_json_result, construct_error_response
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
+from rag.nlp import search
+from rag.utils import rmSpace
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.storage_factory import STORAGE_IMPL
 
+MAXIMUM_OF_UPLOADING_FILES = 256
+
+MAXIMUM_OF_UPLOADING_FILES = 256
+
 
 @manager.route('/dataset/<dataset_id>/documents/upload', methods=['POST'])
 @token_required
@@ -54,34 +98,169 @@ def docinfos(tenant_id):
 @manager.route('/save', methods=['POST'])
 @token_required
 def save_doc(tenant_id):
-    req = request.json
+    req = request.json
+    # get doc by id or name
+    doc_id = None
     if "id" in req:
         doc_id = req["id"]
+    elif "name" in req:
         doc_name = req["name"]
         doc_id = DocumentService.get_doc_id_by_doc_name(doc_name)
+    if not doc_id:
+        return get_json_result(retcode=400, retmsg="Document ID or name is required")
+    e, doc = DocumentService.get_by_id(doc_id)
+    if not e:
+        return get_data_error_result(retmsg="Document not found!")
+    # other values can't be changed
+    if "chunk_num" in req:
+        if req["chunk_num"] != doc.chunk_num:
+            return get_data_error_result(
+                retmsg="Can't change chunk_count.")
+    if "progress" in req:
+        if req['progress'] != doc.progress:
+            return get_data_error_result(
+                retmsg="Can't change progress.")
+    # change name or parse_method
+    if "name" in req and req["name"] != doc.name:
+        try:
+            if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
+                    doc.name.lower()).suffix:
+                return get_json_result(
+                    data=False,
+                    retmsg="The extension of file can't be changed",
+                    retcode=RetCode.ARGUMENT_ERROR)
+            for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
+                if d.name == req["name"]:
+                    return get_data_error_result(
+                        retmsg="Duplicated document name in the same knowledgebase.")
+
+            if not DocumentService.update_by_id(
+                    doc_id, {"name": req["name"]}):
+                return get_data_error_result(
+                    retmsg="Database error (Document rename)!")
+
+            informs = File2DocumentService.get_by_document_id(doc_id)
+            if informs:
+                e, file = FileService.get_by_id(informs[0].file_id)
+                FileService.update_by_id(file.id, {"name": req["name"]})
+        except Exception as e:
+            return server_error_response(e)
+    if "parser_id" in req:
+        try:
+            if doc.parser_id.lower() == req["parser_id"].lower():
+                if "parser_config" in req:
+                    if req["parser_config"] == doc.parser_config:
+                        return get_json_result(data=True)
+                else:
+                    return get_json_result(data=True)
+
+            if doc.type == FileType.VISUAL or re.search(
+                    r"\.(ppt|pptx|pages)$", doc.name):
+                return get_data_error_result(retmsg="Not supported yet!")
+
+            e = DocumentService.update_by_id(doc.id,
+                                             {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+                                              "run": TaskStatus.UNSTART.value})
+            if not e:
+                return get_data_error_result(retmsg="Document not found!")
+            if "parser_config" in req:
+                DocumentService.update_parser_config(doc.id, req["parser_config"])
+            if doc.token_num > 0:
+                e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                        doc.process_duation * -1)
+                if not e:
+                    return get_data_error_result(retmsg="Document not found!")
+                tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+                if not tenant_id:
+                    return get_data_error_result(retmsg="Tenant not found!")
+                ELASTICSEARCH.deleteByQuery(
+                    Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+        except Exception as e:
+            return server_error_response(e)
+    return get_json_result(data=True)
+
+
+
+@manager.route('/change_parser', methods=['POST'])
+@token_required
+def change_parser(tenant_id):
+    req = request.json
+    try:
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        if doc.parser_id.lower() == req["parser_id"].lower():
+            if "parser_config" in req:
+                if req["parser_config"] == doc.parser_config:
+                    return get_json_result(data=True)
+            else:
+                return get_json_result(data=True)
+
+        if doc.type == FileType.VISUAL or re.search(
+                r"\.(ppt|pptx|pages)$", doc.name):
+            return get_data_error_result(retmsg="Not supported yet!")
+
+        e = DocumentService.update_by_id(doc.id,
+                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+                                          "run": TaskStatus.UNSTART.value})
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        if "parser_config" in req:
+            DocumentService.update_parser_config(doc.id, req["parser_config"])
+        if doc.token_num > 0:
+            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                    doc.process_duation * -1)
+            if not e:
+                return get_data_error_result(retmsg="Document not found!")
+            tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+            if not tenant_id:
+                return get_data_error_result(retmsg="Tenant not found!")
+            ELASTICSEARCH.deleteByQuery(
+                Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+
+        return get_json_result(data=True)
+    except Exception as e:
+        return server_error_response(e)
+
+@manager.route('/rename', methods=['POST'])
+@login_required
+@validate_request("doc_id", "name")
+def rename():
+    req = request.json
     try:
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
+                doc.name.lower()).suffix:
+            return get_json_result(
+                data=False,
+                retmsg="The extension of file can't be changed",
+                retcode=RetCode.ARGUMENT_ERROR)
+        for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
+            if d.name == req["name"]:
+                return get_data_error_result(
+                    retmsg="Duplicated document name in the same knowledgebase.")
+
+        if not DocumentService.update_by_id(
+                req["doc_id"], {"name": req["name"]}):
+            return get_data_error_result(
+                retmsg="Database error (Document rename)!")
+
+        informs = File2DocumentService.get_by_document_id(req["doc_id"])
+        if informs:
+            e, file = FileService.get_by_id(informs[0].file_id)
+            FileService.update_by_id(file.id, {"name": req["name"]})
+
+        return get_json_result(data=True)
     except Exception as e:
-        return …
+        return server_error_response(e)
 
 
-@manager.route("/<…
+@manager.route("/<document_id>", methods=["GET"])
 @token_required
 def download_document(dataset_id, document_id):
     try:
-        # Check whether there is this dataset
-        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
-        if not exist:
-            return construct_json_result(code=RetCode.DATA_ERROR,
-                                         message=f"This dataset '{dataset_id}' cannot be found!")
-
         # Check whether there is this document
         exist, document = DocumentService.get_by_id(document_id)
         if not exist:
@@ -108,9 +287,10 @@ def download_document(dataset_id, document_id):
     except Exception as e:
         return construct_error_response(e)
 
+
 @manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
 @token_required
-def list_docs(dataset_id,tenant_id):
+def list_docs(dataset_id, tenant_id):
     kb_id = request.args.get("kb_id")
     if not kb_id:
         return get_json_result(
@@ -177,4 +357,172 @@ def rm(tenant_id):
     if errors:
         return get_json_result(data=False, retmsg=errors, retcode=RetCode.SERVER_ERROR)
 
-    return get_json_result(data=True,retmsg="success")
+    return get_json_result(data=True, retmsg="success")
+
+@manager.route("/<document_id>/status", methods=["GET"])
+@token_required
+def show_parsing_status(tenant_id, document_id):
+    try:
+        # valid document
+        exist, _ = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(code=RetCode.DATA_ERROR,
+                                         message=f"This document: '{document_id}' is not a valid document.")
+
+        _, doc = DocumentService.get_by_id(document_id)  # get doc object
+        doc_attributes = doc.to_dict()
+
+        return construct_json_result(
+            data={"progress": doc_attributes["progress"], "status": TaskStatus(doc_attributes["status"]).name},
+            code=RetCode.SUCCESS
+        )
+    except Exception as e:
+        return construct_error_response(e)
+
+
+
+@manager.route('/run', methods=['POST'])
+@token_required
+def run(tenant_id):
+    req = request.json
+    try:
+        for id in req["doc_ids"]:
+            info = {"run": str(req["run"]), "progress": 0}
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                info["progress_msg"] = ""
+                info["chunk_num"] = 0
+                info["token_num"] = 0
+            DocumentService.update_by_id(id, info)
+            # if str(req["run"]) == TaskStatus.CANCEL.value:
+            tenant_id = DocumentService.get_tenant_id(id)
+            if not tenant_id:
+                return get_data_error_result(retmsg="Tenant not found!")
+            ELASTICSEARCH.deleteByQuery(
+                Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                TaskService.filter_delete([Task.doc_id == id])
+                e, doc = DocumentService.get_by_id(id)
+                doc = doc.to_dict()
+                doc["tenant_id"] = tenant_id
+                bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+                queue_tasks(doc, bucket, name)
+
+        return get_json_result(data=True)
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route('/chunk/list', methods=['POST'])
+@token_required
+@validate_request("doc_id")
+def list_chunk(tenant_id):
+    req = request.json
+    doc_id = req["doc_id"]
+    page = int(req.get("page", 1))
+    size = int(req.get("size", 30))
+    question = req.get("keywords", "")
+    try:
+        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+        if not tenant_id:
+            return get_data_error_result(retmsg="Tenant not found!")
+        e, doc = DocumentService.get_by_id(doc_id)
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        query = {
+            "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
+        }
+        if "available_int" in req:
+            query["available_int"] = int(req["available_int"])
+        sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
+        res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
+        for id in sres.ids:
+            d = {
+                "chunk_id": id,
+                "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
+                    id].get("content_with_weight", ""),
+                "doc_id": sres.field[id]["doc_id"],
+                "docnm_kwd": sres.field[id]["docnm_kwd"],
+                "important_kwd": sres.field[id].get("important_kwd", []),
+                "img_id": sres.field[id].get("img_id", ""),
+                "available_int": sres.field[id].get("available_int", 1),
+                "positions": sres.field[id].get("position_int", "").split("\t")
+            }
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
+            res["chunks"].append(d)
+        return get_json_result(data=res)
+    except Exception as e:
+        if str(e).find("not_found") > 0:
+            return get_json_result(data=False, retmsg=f'No chunk found!',
+                                   retcode=RetCode.DATA_ERROR)
+        return server_error_response(e)
+
+
+@manager.route('/chunk/create', methods=['POST'])
+@token_required
+@validate_request("doc_id", "content_with_weight")
+def create(tenant_id):
+    req = request.json
+    md5 = hashlib.md5()
+    md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
+    chunck_id = md5.hexdigest()
+    d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
+         "content_with_weight": req["content_with_weight"]}
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+    d["important_kwd"] = req.get("important_kwd", [])
+    d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
+    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+
+    try:
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        d["kb_id"] = [doc.kb_id]
+        d["docnm_kwd"] = doc.name
+        d["doc_id"] = doc.id
+
+        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+        if not tenant_id:
+            return get_data_error_result(retmsg="Tenant not found!")
+
+        embd_id = DocumentService.get_embd_id(req["doc_id"])
+        embd_mdl = TenantLLMService.model_instance(
+            tenant_id, LLMType.EMBEDDING.value, embd_id)
+
+        v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+        v = 0.1 * v[0] + 0.9 * v[1]
+        d["q_%d_vec" % len(v)] = v.tolist()
+        ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
+
+        DocumentService.increment_chunk_num(
+            doc.id, doc.kb_id, c, 1, 0)
+        return get_json_result(data={"chunk": d})
+        # return get_json_result(data={"chunk_id": chunck_id})
+    except Exception as e:
+        return server_error_response(e)
+
+@manager.route('/chunk/rm', methods=['POST'])
+@token_required
+@validate_request("chunk_ids", "doc_id")
+def rm():
+    req = request.json
+    try:
+        if not ELASTICSEARCH.deleteByQuery(
+                Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
+            return get_data_error_result(retmsg="Index updating failure")
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(retmsg="Document not found!")
+        deleted_chunk_ids = req["chunk_ids"]
+        chunk_number = len(deleted_chunk_ids)
+        DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
+        return get_json_result(data=True)
+    except Exception as e:
+        return server_error_response(e)
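For reviewers who want to exercise the new routes directly over HTTP (outside the SDK), a minimal sketch with requests is shown below. The base path /api/v1/doc, the Bearer-token header, and the host, key and document id are assumptions inferred from the SDK helpers and tests, not something this diff pins down.

import requests

# Hedged sketch: host, API key, document id and the /api/v1/doc prefix are placeholders.
HOST = "http://127.0.0.1:9380"
HEADERS = {"Authorization": "Bearer ragflow-xxxxxx"}
DOC_ID = "your-document-id"

# Start parsing via the new /run route, then poll the new /<document_id>/status route.
requests.post(f"{HOST}/api/v1/doc/run",
              json={"doc_ids": [DOC_ID], "run": "1"}, headers=HEADERS)
status = requests.get(f"{HOST}/api/v1/doc/{DOC_ID}/status", headers=HEADERS).json()
print(status.get("data"))  # e.g. {"progress": 0.5, "status": "RUNNING"}

# List the document's chunks via the new /chunk/list route.
chunks = requests.post(f"{HOST}/api/v1/doc/chunk/list",
                       json={"doc_id": DOC_ID, "page": 1, "size": 30},
                       headers=HEADERS).json()
print(chunks["data"]["total"])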
sdk/python/ragflow/__init__.py
CHANGED
@@ -6,4 +6,5 @@ from .ragflow import RAGFlow
 from .modules.dataset import DataSet
 from .modules.assistant import Assistant
 from .modules.session import Session
-from .modules.document import Document
+from .modules.document import Document
+from .modules.chunk import Chunk
sdk/python/ragflow/modules/chunk.py
ADDED
@@ -0,0 +1,34 @@
from .base import Base


class Chunk(Base):
    def __init__(self, rag, res_dict):
        # Initialize the class attributes
        self.id = ""
        self.content_with_weight = ""
        self.content_ltks = []
        self.content_sm_ltks = []
        self.important_kwd = []
        self.important_tks = []
        self.create_time = ""
        self.create_timestamp_flt = 0.0
        self.kb_id = None
        self.docnm_kwd = ""
        self.doc_id = ""
        self.q_vec = []
        self.status = "1"
        for k, v in res_dict.items():
            if hasattr(self, k):
                setattr(self, k, v)

        super().__init__(rag, res_dict)

    def delete(self) -> bool:
        """
        Delete the chunk in the document.
        """
        res = self.rm('/doc/chunk/rm',
                      {"chunk_ids": [self.id], "doc_id": self.doc_id})
        res = res.json()
        if res.get("retmsg") == "success":
            return True
        raise Exception(res["retmsg"])
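A small usage sketch for the new Chunk class, assuming a reachable RAGFlow server; the API key, host and document name are placeholders in the style of the test suite.

from ragflow import RAGFlow

rag = RAGFlow("ragflow-xxxxxx", "http://127.0.0.1:9380")   # placeholder credentials

doc = rag.get_document(name="story.txt")                   # an already-ingested document
chunk = doc.add_chunk(content="manually added chunk")

# Chunk.delete() posts to /doc/chunk/rm and returns True when the server
# answers retmsg == "success"; otherwise it raises.
assert chunk.delete() is True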
sdk/python/ragflow/modules/document.py
CHANGED
@@ -1,6 +1,7 @@
+import time
 
 from .base import Base
-
+from .chunk import Chunk
 
 
 class Document(Base):
@@ -21,6 +22,8 @@ class Document(Base):
         self.progress_msg = ""
         self.process_begin_at = None
         self.process_duration = 0.0
+        self.run = "0"
+        self.status = "1"
         for k in list(res_dict.keys()):
             if k not in self.__dict__:
                 res_dict.pop(k)
@@ -61,7 +64,7 @@ class Document(Base):
         :return: The downloaded document content in bytes.
         """
         # Construct the URL for the API request using the document ID and knowledge base ID
-        res = self.get(f"/doc/{self.…
+        res = self.get(f"/doc/{self.id}",
                        {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
 
         # Check the response status code to ensure the request was successful
@@ -73,3 +76,121 @@ class Document(Base):
         raise Exception(
             f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
         )
+
+    def async_parse(self):
+        """
+        Initiate document parsing asynchronously without waiting for completion.
+        """
+        try:
+            # Construct request data including document ID and run status (assuming 1 means to run)
+            data = {"doc_ids": [self.id], "run": 1}
+
+            # Send a POST request to the parsing endpoint to start parsing
+            res = self.post(f'/doc/run', data)
+
+            # Check the server response status code
+            if res.status_code != 200:
+                raise Exception(f"Failed to start async parsing: {res.text}")
+
+            print("Async parsing started successfully.")
+
+        except Exception as e:
+            # Catch and handle exceptions
+            print(f"Error occurred during async parsing: {str(e)}")
+            raise
+
+    import time
+
+    def join(self, interval=5, timeout=3600):
+        """
+        Wait for the asynchronous parsing to complete and yield parsing progress periodically.
+
+        :param interval: The time interval (in seconds) for progress reports.
+        :param timeout: The timeout (in seconds) for the parsing operation.
+        :return: An iterator yielding parsing progress and messages.
+        """
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            # Check the parsing status
+            res = self.get(f'/doc/{self.id}/status', {"doc_ids": [self.id]})
+            res_data = res.json()
+            data = res_data.get("data", [])
+
+            # Retrieve progress and status message
+            progress = data.get("progress", 0)
+            progress_msg = data.get("status", "")
+
+            yield progress, progress_msg  # Yield progress and message
+
+            if progress == 100:  # Parsing completed
+                break
+
+            time.sleep(interval)
+
+    def cancel(self):
+        """
+        Cancel the parsing task for the document.
+        """
+        try:
+            # Construct request data, including document ID and action to cancel (assuming 2 means cancel)
+            data = {"doc_ids": [self.id], "run": 2}
+
+            # Send a POST request to the parsing endpoint to cancel parsing
+            res = self.post(f'/doc/run', data)
+
+            # Check the server response status code
+            if res.status_code != 200:
+                print("Failed to cancel parsing. Server response:", res.text)
+            else:
+                print("Parsing cancelled successfully.")
+
+        except Exception as e:
+            print(f"Error occurred during async parsing cancellation: {str(e)}")
+            raise
+
+    def list_chunks(self, page=1, offset=0, limit=12, size=30, keywords="", available_int=None):
+        """
+        List all chunks associated with this document by calling the external API.
+
+        Args:
+            page (int): The page number to retrieve (default 1).
+            size (int): The number of chunks per page (default 30).
+            keywords (str): Keywords for searching specific chunks (default "").
+            available_int (int): Filter for available chunks (optional).
+
+        Returns:
+            list: A list of chunks returned from the API.
+        """
+        data = {
+            "doc_id": self.id,
+            "page": page,
+            "size": size,
+            "keywords": keywords,
+            "offset": offset,
+            "limit": limit
+        }
+
+        if available_int is not None:
+            data["available_int"] = available_int
+
+        res = self.post(f'/doc/chunk/list', data)
+        if res.status_code == 200:
+            res_data = res.json()
+            if res_data.get("retmsg") == "success":
+                chunks = res_data["data"]["chunks"]
+                self.chunks = chunks  # Store the chunks in the document instance
+                return chunks
+            else:
+                raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
+        else:
+            raise Exception(f"API request failed with status code {res.status_code}")
+
+    def add_chunk(self, content: str):
+        res = self.post('/doc/chunk/create', {"doc_id": self.id, "content_with_weight": content})
+
+        # Assume the returned response contains the chunk's information
+        if res.status_code == 200:
+            chunk_data = res.json()
+            return Chunk(self.rag, chunk_data)  # Assume a Chunk class wraps the chunk object
+        else:
+            raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
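Putting the new Document methods together, a minimal end-to-end sketch; the server address, API key and file paths are placeholders.

from ragflow import RAGFlow

rag = RAGFlow("ragflow-xxxxxx", "http://127.0.0.1:9380")
ds = rag.create_dataset(name="demo")
rag.create_document(ds, name="story.txt",
                    blob=open("test_data/story.txt", "rb").read())

doc = rag.get_document(name="story.txt")
doc.async_parse()                      # fire-and-forget start via /doc/run

# join() polls /doc/<id>/status and yields (progress, status) until done or timeout
for progress, status in doc.join(interval=5, timeout=600):
    print(progress, status)

# list_chunks() wraps /doc/chunk/list and returns a list of chunk dicts
for chunk in doc.list_chunks(keywords="treasure"):
    print(chunk["content_with_weight"])

new_chunk = doc.add_chunk(content="an extra chunk")  # returns a Chunk object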
sdk/python/ragflow/ragflow.py
CHANGED
@@ -171,3 +171,50 @@ class RAGFlow:
             return Document(self, res['data'])
         raise Exception(res["retmsg"])
 
+    def async_parse_documents(self, doc_ids):
+        """
+        Asynchronously start parsing multiple documents without waiting for completion.
+
+        :param doc_ids: A list containing multiple document IDs.
+        """
+        try:
+            if not doc_ids or not isinstance(doc_ids, list):
+                raise ValueError("doc_ids must be a non-empty list of document IDs")
+
+            data = {"doc_ids": doc_ids, "run": 1}
+
+            res = self.post(f'/doc/run', data)
+
+            if res.status_code != 200:
+                raise Exception(f"Failed to start async parsing for documents: {res.text}")
+
+            print(f"Async parsing started successfully for documents: {doc_ids}")
+
+        except Exception as e:
+            print(f"Error occurred during async parsing for documents: {str(e)}")
+            raise
+
+    def async_cancel_parse_documents(self, doc_ids):
+        """
+        Cancel the asynchronous parsing of multiple documents.
+
+        :param doc_ids: A list containing multiple document IDs.
+        """
+        try:
+            if not doc_ids or not isinstance(doc_ids, list):
+                raise ValueError("doc_ids must be a non-empty list of document IDs")
+            data = {"doc_ids": doc_ids, "run": 2}
+
+            res = self.post(f'/doc/run', data)
+
+            if res.status_code != 200:
+                raise Exception(f"Failed to cancel async parsing for documents: {res.text}")
+
+            print(f"Async parsing canceled successfully for documents: {doc_ids}")
+
+        except Exception as e:
+            print(f"Error occurred during canceling parsing for documents: {str(e)}")
+            raise
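The two new RAGFlow helpers are thin wrappers over the same /doc/run route; a short usage sketch (credentials and file names are placeholders).

from ragflow import RAGFlow

rag = RAGFlow("ragflow-xxxxxx", "http://127.0.0.1:9380")
doc_ids = [rag.get_document(name=n).id for n in ("ai1.pdf", "ai2.pdf", "ai3.pdf")]

rag.async_parse_documents(doc_ids)         # run=1: queue parsing for the whole batch
# ...later, if the batch has to be stopped:
rag.async_cancel_parse_documents(doc_ids)  # run=2: cancel parsing for the batch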
sdk/python/test/t_document.py
CHANGED
@@ -1,4 +1,4 @@
-from ragflow import RAGFlow, DataSet, Document
+from ragflow import RAGFlow, DataSet, Document, Chunk
 
 from common import API_KEY, HOST_ADDRESS
 from test_sdkbase import TestSdk
@@ -46,6 +46,7 @@ class TestDocument(TestSdk):
         doc = rag.get_document(name="TestDocument.txt")
         if isinstance(doc, Document):
             doc.parser_method = "manual"
+            doc.name = "manual.txt"
             res = doc.save()
             assert res is True, f"Failed to update document, error: {res}"
         else:
@@ -126,8 +127,8 @@ class TestDocument(TestSdk):
         blob1 = b"Sample document content for ingestion test333."
         name2 = "Test Document444.txt"
         blob2 = b"Sample document content for ingestion test444."
-        name3='test.txt'
-        path='test_data/test.txt'
+        name3 = 'test.txt'
+        path = 'test_data/test.txt'
         rag.create_document(ds, name=name3, blob=open(path, "rb").read())
         rag.create_document(ds, name=name1, blob=blob1)
         rag.create_document(ds, name=name2, blob=blob2)
@@ -138,7 +139,131 @@ class TestDocument(TestSdk):
         remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
         assert len(remaining_docs) == 0, "Documents were not properly deleted."
 
+    def test_parse_and_cancel_document(self):
+        # Initialize RAGFlow with API key and host address
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+
+        # Create a dataset with a specific name
+        ds = rag.create_dataset(name="God4")
+
+        # Define the document name and path
+        name3 = 'ai.pdf'
+        path = 'test_data/ai.pdf'
+
+        # Create a document in the dataset using the file path
+        rag.create_document(ds, name=name3, blob=open(path, "rb").read())
+
+        # Retrieve the document by name
+        doc = rag.get_document(name="ai.pdf")
+
+        # Initiate asynchronous parsing
+        doc.async_parse()
+
+        # Print message to confirm asynchronous parsing has been initiated
+        print("Async parsing initiated")
+
+        # Use join to wait for parsing to complete and get progress updates
+        for progress, msg in doc.join(interval=5, timeout=10):
+            print(progress, msg)
+            # Assert that the progress is within the valid range (0 to 100)
+            assert 0 <= progress <= 100, f"Invalid progress: {progress}"
+            # Assert that the message is not empty
+            assert msg, "Message should not be empty"
+        # Test cancelling the parsing operation
+        doc.cancel()
+        # Print message to confirm parsing has been cancelled successfully
+        print("Parsing cancelled successfully")
+
+    def test_bulk_parse_and_cancel_documents(self):
+        # Initialize RAGFlow with API key and host address
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+
+        # Create a dataset
+        ds = rag.create_dataset(name="God5")
+        assert ds is not None, "Dataset creation failed"
+        assert ds.name == "God5", "Dataset name does not match"
+
+        # Prepare a list of file names and paths
+        documents = [
+            {'name': 'ai1.pdf', 'path': 'test_data/ai1.pdf'},
+            {'name': 'ai2.pdf', 'path': 'test_data/ai2.pdf'},
+            {'name': 'ai3.pdf', 'path': 'test_data/ai3.pdf'}
+        ]
+
+        # Create documents in bulk
+        for doc_info in documents:
+            with open(doc_info['path'], "rb") as file:
+                created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
+                assert created_doc is not None, f"Failed to create document {doc_info['name']}"
 
+        # Retrieve document objects in bulk
+        docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
+        ids = [doc.id for doc in docs]
+        assert len(docs) == len(documents), "Mismatch between created documents and fetched documents"
+
+        # Initiate asynchronous parsing for all documents
+        rag.async_parse_documents(ids)
+        print("Async bulk parsing initiated")
+
+        # Wait for all documents to finish parsing and check progress
+        for doc in docs:
+            for progress, msg in doc.join(interval=5, timeout=10):
+                print(f"{doc.name}: Progress: {progress}, Message: {msg}")
+
+                # Assert that progress is within the valid range
+                assert 0 <= progress <= 100, f"Invalid progress: {progress} for document {doc.name}"
+
+                # Assert that the message is not empty
+                assert msg, f"Message should not be empty for document {doc.name}"
+
+                # If progress reaches 100%, assert that parsing is completed successfully
+                if progress == 100:
+                    assert "completed" in msg.lower(), f"Document {doc.name} did not complete successfully"
+
+        # Cancel parsing for all documents in bulk
+        cancel_result = rag.async_cancel_parse_documents(ids)
+        assert cancel_result is None or isinstance(cancel_result, type(None)), "Failed to cancel document parsing"
+        print("Async bulk parsing cancelled")
+
+    def test_parse_document_and_chunk_list(self):
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+        ds = rag.create_dataset(name="God7")
+        name = 'story.txt'
+        path = 'test_data/story.txt'
+        # name = "Test Document rag.txt"
+        # blob = " Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps."
+        rag.create_document(ds, name=name, blob=open(path, "rb").read())
+        doc = rag.get_document(name=name)
+        doc.async_parse()
+
+        # Wait for parsing to complete and get progress updates using join
+        for progress, msg in doc.join(interval=5, timeout=30):
+            print(progress, msg)
+            # Assert that progress is within 0 to 100
+            assert 0 <= progress <= 100, f"Invalid progress: {progress}"
+            # Assert that the message is not empty
+            assert msg, "Message should not be empty"
+
+        for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
+            print(c)
+            assert c is not None, "Chunk is None"
+            assert "rag" in c['content_with_weight'].lower(), f"Keyword 'rag' not found in chunk content: {c.content}"
+
+    def test_add_chunk_to_chunk_list(self):
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+        doc = rag.get_document(name='story.txt')
+        chunk = doc.add_chunk(content="assss")
+        assert chunk is not None, "Chunk is None"
+        assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
+
+    def test_delete_chunk_of_chunk_list(self):
+        rag = RAGFlow(API_KEY, HOST_ADDRESS)
+        doc = rag.get_document(name='story.txt')
+        chunk = doc.add_chunk(content="assss")
+        assert chunk is not None, "Chunk is None"
+        assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
+        chunk_num_before = doc.chunk_num
+        chunk.delete()
+        assert doc.chunk_num == chunk_num_before - 1, "Chunk was not deleted"
 
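These tests hit a live server configured through common.API_KEY and common.HOST_ADDRESS. Assuming pytest is the runner used for this suite, the new cases could be selected like this (a sketch, not part of the PR):

# Hedged sketch: assumes pytest and a reachable server; the -k expression just
# narrows the run to the parsing/chunk tests added in this PR.
import pytest

pytest.main(["-q", "t_document.py", "-k", "parse or chunk"])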
sdk/python/test/test_data/ragflow_test.txt
ADDED
@@ -0,0 +1,29 @@
Introducing RagFlow: Revolutionizing Natural Language Processing with Retrieval-Augmented Generation
In the ever-evolving landscape of Natural Language Processing (NLP), new techniques and frameworks continue to push the boundaries of what machines can understand and generate from human language. Among these innovative advancements, RagFlow stands out as a pioneering approach that combines the power of retrieval and generation to revolutionize the way we interact with text-based data.
What is RagFlow?
RagFlow, short for Retrieval-Augmented Generation Flow, is a framework designed to enhance the capabilities of NLP models by integrating a retrieval component into the generation process. This approach leverages large-scale knowledge bases and text corpora to retrieve relevant information that can inform and enrich the output generated by the model. By doing so, RagFlow enables models to produce more accurate, informative, and contextually relevant responses, surpassing the limitations of traditional generation-only or retrieval-only systems.
The Core Concept
At its core, RagFlow operates on two fundamental principles:
Retrieval: The first step involves identifying and retrieving relevant information from a vast collection of text sources. This can include web pages, academic articles, books, or any other form of unstructured text data. RagFlow employs advanced retrieval algorithms, often based on neural networks and vector similarity, to quickly and accurately locate the most pertinent information for a given query or task.
Generation: Once relevant information has been retrieved, RagFlow leverages generative NLP models to produce the final output. These models, such as transformers or GPT-like architectures, are trained to understand the context provided by the retrieved information and generate coherent, fluent text that incorporates this knowledge. The integration of retrieval and generation allows RagFlow to generate responses that are not only grammatically correct but also semantically rich and contextually appropriate.
Advantages of RagFlow
Increased Accuracy and Relevance: By incorporating retrieved information, RagFlow can generate responses that are more accurate and relevant to the user's query or task. This is particularly useful in domains where factual accuracy and contextual relevance are crucial, such as question answering, summarization, and knowledge-intensive dialogue systems.
Scalability and Flexibility: RagFlow's reliance on large-scale text corpora and retrieval algorithms makes it highly scalable to new domains and datasets. As more data becomes available, the retrieval component can be easily updated to incorporate new information, while the generative model can be fine-tuned to adapt to specific tasks or user preferences.
Improved Efficiency: By leveraging pre-existing knowledge bases and retrieval algorithms, RagFlow can reduce the computational burden on the generative model. This allows the model to focus on generating high-quality output rather than searching for relevant information from scratch, resulting in improved efficiency and faster response times.
Applications and Future Directions
RagFlow has the potential to transform a wide range of NLP applications, including but not limited to:
Question Answering Systems: By retrieving relevant passages and generating precise answers, RagFlow can enhance the accuracy and comprehensiveness of question answering systems.
Document Summarization: By identifying key information and generating concise summaries, RagFlow can help users quickly grasp the main points of lengthy documents.
Creative Writing and Storytelling: By incorporating retrieved elements into the generation process, RagFlow can inspire and augment creative writing, enabling machines to produce more engaging and original stories.
As the field of NLP continues to evolve, RagFlow represents a promising direction for leveraging the power of both retrieval and generation. With further research and development, we can expect to see even more sophisticated and versatile RagFlow-based systems that push the boundaries of what machines can achieve with human language.
sdk/python/test/test_data/story.txt
ADDED
@@ -0,0 +1,8 @@
Once upon a time, in a small village nestled at the foot of a towering mountain, lived a young girl named Lily. Lily had a heart as pure as the mountain's snowcaps and a spirit as adventurous as the winding trails that led to its peak.
One day, as Lily was gathering berries in the forest's edge, she stumbled upon an old, weathered map hidden beneath a fallen tree. The map was covered in strange symbols and a single, glowing word: "Treasure." Curiousity piqued, Lily decided to embark on a quest to uncover the mystery of the treasure.
With nothing more than her trusty basket of berries, a few pieces of bread, and the map, Lily set off into the unknown. As she climbed higher and higher into the mountains, the air grew crisp, and the scenery transformed into a breathtaking tapestry of lush greenery and sparkling streams.
Along the way, Lily encountered all sorts of challenges. She had to navigate treacherous rivers using fallen logs as bridges, climb steep cliffs with nothing but her agility and determination, and even outsmart a mischievous pack of foxes that tried to lead her astray. But through it all, Lily remained steadfast, her heart filled with hope and a sense of purpose.
Finally, after what seemed like an eternity of trekking, Lily arrived at a hidden valley. At its center stood an ancient tree, its roots entwined with glittering jewels and a chest made of pure gold. This, the map had revealed, was the source of the treasure.
But as Lily approached the chest, she realized that the true treasure was not the riches before her. It was the journey itself—the friendships she had forged with the animals she encountered, the strength she had gained from overcoming obstacles, and the sense of wonder and discovery that filled her heart.
With a smile on her face, Lily gently closed the chest and left it where it was, content in the knowledge that the greatest treasures in life are not always found in gold or jewels. She turned back towards home, her heart full of stories to share and a spirit that had been forever changed by her adventure.
And so, Lily returned to her village, a hero in her own right, with a tale that would be whispered around firesides for generations to come.
sdk/python/test/test_data/test1.txt
CHANGED
@@ -1,2 +1,4 @@
 test1
-test1
+test1
+aaaa document args arg
+rag document
sdk/python/test/test_data/test2.txt
ADDED
@@ -0,0 +1,4 @@
test22
test22
aaaa document args arg
rag document
sdk/python/test/test_data/test3.txt
ADDED
@@ -0,0 +1,4 @@
test3
test333
aaaa document args arg
rag document
sdk/python/test/test_data/westworld.pdf
ADDED
Binary file (33.1 kB)