JobSmithManipulation, Kevin Hu committed on
Commit 278278b · 1 Parent(s): bac5213

update sdk document and chunk (#2421)


### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <[email protected]>

api/apps/sdk/doc.py CHANGED
@@ -1,19 +1,63 @@
- from io import BytesIO
- from flask import request,send_file
- from api.utils.api_utils import get_json_result, construct_json_result, server_error_response
  from api.utils.api_utils import get_json_result, token_required, get_data_error_result
- from api.db import FileType, ParserType, FileSource, TaskStatus
  from api.db.db_models import File
  from api.db.services.document_service import DocumentService
  from api.db.services.file2document_service import File2DocumentService
  from api.db.services.file_service import FileService
  from api.db.services.knowledgebase_service import KnowledgebaseService
- from api.db.services.user_service import TenantService, UserTenantService
- from api.settings import RetCode
  from api.utils.api_utils import construct_json_result, construct_error_response
  from rag.utils.storage_factory import STORAGE_IMPL

  @manager.route('/dataset/<dataset_id>/documents/upload', methods=['POST'])
  @token_required
@@ -54,34 +98,169 @@ def docinfos(tenant_id):
  @manager.route('/save', methods=['POST'])
  @token_required
  def save_doc(tenant_id):
-     req = request.json  # Expecting JSON input
      if "id" in req:
          doc_id = req["id"]
-     if "name" in req:
          doc_name = req["name"]
          doc_id = DocumentService.get_doc_id_by_doc_name(doc_name)
-     data = request.json
-     # Call the update method with the provided id and data
      try:
-         num = DocumentService.update_by_id(doc_id, data)
-         if num > 0:
-             return get_json_result(retmsg="success", data={"updated_count": num})
-         else:
-             return get_json_result(retcode=404, retmsg="Document not found")
      except Exception as e:
-         return get_json_result(retmsg=f"Error occurred: {str(e)}")


- @manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
  @token_required
  def download_document(dataset_id, document_id):
      try:
-         # Check whether there is this dataset
-         exist, _ = KnowledgebaseService.get_by_id(dataset_id)
-         if not exist:
-             return construct_json_result(code=RetCode.DATA_ERROR,
-                                          message=f"This dataset '{dataset_id}' cannot be found!")
-
          # Check whether there is this document
          exist, document = DocumentService.get_by_id(document_id)
          if not exist:
@@ -108,9 +287,10 @@ def download_document(dataset_id, document_id):
      except Exception as e:
          return construct_error_response(e)

  @manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
  @token_required
- def list_docs(dataset_id,tenant_id):
      kb_id = request.args.get("kb_id")
      if not kb_id:
          return get_json_result(
@@ -177,4 +357,172 @@ def rm(tenant_id):
      if errors:
          return get_json_result(data=False, retmsg=errors, retcode=RetCode.SERVER_ERROR)

-     return get_json_result(data=True,retmsg="success")
+ import pathlib
+ import re
+ import datetime
+ import json
+ import traceback
+
+ from flask import request
+ from flask_login import login_required, current_user
+ from elasticsearch_dsl import Q

+ from rag.app.qa import rmPrefix, beAdoc
+ from rag.nlp import search, rag_tokenizer, keyword_extraction
+ from rag.utils.es_conn import ELASTICSEARCH
+ from rag.utils import rmSpace
+ from api.db import LLMType, ParserType
+ from api.db.services.knowledgebase_service import KnowledgebaseService
+ from api.db.services.llm_service import TenantLLMService
+ from api.db.services.user_service import UserTenantService
+ from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
+ from api.db.services.document_service import DocumentService
+ from api.settings import RetCode, retrievaler, kg_retrievaler
+ from api.utils.api_utils import get_json_result
+ import hashlib
  from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+
+ from api.db.db_models import Task, File
+
+ from api.db.services.task_service import TaskService, queue_tasks
+ from api.db.services.user_service import TenantService, UserTenantService
+
+ from functools import partial
+ from io import BytesIO
+
+ from flask import request, send_file
+ from flask_login import login_required
+
+ from api.db import FileSource, TaskStatus, FileType
  from api.db.db_models import File
  from api.db.services.document_service import DocumentService
  from api.db.services.file2document_service import File2DocumentService
  from api.db.services.file_service import FileService
  from api.db.services.knowledgebase_service import KnowledgebaseService
+ from api.settings import RetCode, retrievaler
  from api.utils.api_utils import construct_json_result, construct_error_response
+ from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
  from rag.utils.storage_factory import STORAGE_IMPL

+ MAXIMUM_OF_UPLOADING_FILES = 256
+

  @manager.route('/dataset/<dataset_id>/documents/upload', methods=['POST'])
  @token_required
  @manager.route('/save', methods=['POST'])
  @token_required
  def save_doc(tenant_id):
+     req = request.json
+     # get doc by id or name
+     doc_id = None
      if "id" in req:
          doc_id = req["id"]
+     elif "name" in req:
          doc_name = req["name"]
          doc_id = DocumentService.get_doc_id_by_doc_name(doc_name)
+     if not doc_id:
+         return get_json_result(retcode=400, retmsg="Document ID or name is required")
+     e, doc = DocumentService.get_by_id(doc_id)
+     if not e:
+         return get_data_error_result(retmsg="Document not found!")
+     # other values can't be changed
+     if "chunk_num" in req:
+         if req["chunk_num"] != doc.chunk_num:
+             return get_data_error_result(
+                 retmsg="Can't change chunk_count.")
+     if "progress" in req:
+         if req['progress'] != doc.progress:
+             return get_data_error_result(
+                 retmsg="Can't change progress.")
+     # change name or parse_method
+     if "name" in req and req["name"] != doc.name:
+         try:
+             if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
+                     doc.name.lower()).suffix:
+                 return get_json_result(
+                     data=False,
+                     retmsg="The extension of file can't be changed",
+                     retcode=RetCode.ARGUMENT_ERROR)
+             for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
+                 if d.name == req["name"]:
+                     return get_data_error_result(
+                         retmsg="Duplicated document name in the same knowledgebase.")
+
+             if not DocumentService.update_by_id(
+                     doc_id, {"name": req["name"]}):
+                 return get_data_error_result(
+                     retmsg="Database error (Document rename)!")
+
+             informs = File2DocumentService.get_by_document_id(doc_id)
+             if informs:
+                 e, file = FileService.get_by_id(informs[0].file_id)
+                 FileService.update_by_id(file.id, {"name": req["name"]})
+         except Exception as e:
+             return server_error_response(e)
+     if "parser_id" in req:
+         try:
+             if doc.parser_id.lower() == req["parser_id"].lower():
+                 if "parser_config" in req:
+                     if req["parser_config"] == doc.parser_config:
+                         return get_json_result(data=True)
+                 else:
+                     return get_json_result(data=True)
+
+             if doc.type == FileType.VISUAL or re.search(
+                     r"\.(ppt|pptx|pages)$", doc.name):
+                 return get_data_error_result(retmsg="Not supported yet!")
+
+             e = DocumentService.update_by_id(doc.id,
+                                              {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+                                               "run": TaskStatus.UNSTART.value})
+             if not e:
+                 return get_data_error_result(retmsg="Document not found!")
+             if "parser_config" in req:
+                 DocumentService.update_parser_config(doc.id, req["parser_config"])
+             if doc.token_num > 0:
+                 e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                         doc.process_duation * -1)
+                 if not e:
+                     return get_data_error_result(retmsg="Document not found!")
+                 tenant_id = DocumentService.get_tenant_id(doc_id)
+                 if not tenant_id:
+                     return get_data_error_result(retmsg="Tenant not found!")
+                 ELASTICSEARCH.deleteByQuery(
+                     Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+         except Exception as e:
+             return server_error_response(e)
+     return get_json_result(data=True)
+
+
+ @manager.route('/change_parser', methods=['POST'])
+ @token_required
+ def change_parser(tenant_id):
+     req = request.json
+     try:
+         e, doc = DocumentService.get_by_id(req["doc_id"])
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         if doc.parser_id.lower() == req["parser_id"].lower():
+             if "parser_config" in req:
+                 if req["parser_config"] == doc.parser_config:
+                     return get_json_result(data=True)
+             else:
+                 return get_json_result(data=True)
+
+         if doc.type == FileType.VISUAL or re.search(
+                 r"\.(ppt|pptx|pages)$", doc.name):
+             return get_data_error_result(retmsg="Not supported yet!")
+
+         e = DocumentService.update_by_id(doc.id,
+                                          {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+                                           "run": TaskStatus.UNSTART.value})
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         if "parser_config" in req:
+             DocumentService.update_parser_config(doc.id, req["parser_config"])
+         if doc.token_num > 0:
+             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
+                                                     doc.process_duation * -1)
+             if not e:
+                 return get_data_error_result(retmsg="Document not found!")
+             tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+             if not tenant_id:
+                 return get_data_error_result(retmsg="Tenant not found!")
+             ELASTICSEARCH.deleteByQuery(
+                 Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+
+         return get_json_result(data=True)
+     except Exception as e:
+         return server_error_response(e)
+
+ @manager.route('/rename', methods=['POST'])
+ @login_required
+ @validate_request("doc_id", "name")
+ def rename():
+     req = request.json
      try:
+         e, doc = DocumentService.get_by_id(req["doc_id"])
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
+                 doc.name.lower()).suffix:
+             return get_json_result(
+                 data=False,
+                 retmsg="The extension of file can't be changed",
+                 retcode=RetCode.ARGUMENT_ERROR)
+         for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
+             if d.name == req["name"]:
+                 return get_data_error_result(
+                     retmsg="Duplicated document name in the same knowledgebase.")
+
+         if not DocumentService.update_by_id(
+                 req["doc_id"], {"name": req["name"]}):
+             return get_data_error_result(
+                 retmsg="Database error (Document rename)!")
+
+         informs = File2DocumentService.get_by_document_id(req["doc_id"])
+         if informs:
+             e, file = FileService.get_by_id(informs[0].file_id)
+             FileService.update_by_id(file.id, {"name": req["name"]})
+
+         return get_json_result(data=True)
      except Exception as e:
+         return server_error_response(e)

+ @manager.route("/<document_id>", methods=["GET"])
  @token_required
  def download_document(dataset_id, document_id):
      try:

          # Check whether there is this document
          exist, document = DocumentService.get_by_id(document_id)
          if not exist:

      except Exception as e:
          return construct_error_response(e)

+
  @manager.route('/dataset/<dataset_id>/documents', methods=['GET'])
  @token_required
+ def list_docs(dataset_id, tenant_id):
      kb_id = request.args.get("kb_id")
      if not kb_id:
          return get_json_result(

      if errors:
          return get_json_result(data=False, retmsg=errors, retcode=RetCode.SERVER_ERROR)

+     return get_json_result(data=True, retmsg="success")
+
+ @manager.route("/<document_id>/status", methods=["GET"])
+ @token_required
+ def show_parsing_status(tenant_id, document_id):
+     try:
+         # valid document
+         exist, _ = DocumentService.get_by_id(document_id)
+         if not exist:
+             return construct_json_result(code=RetCode.DATA_ERROR,
+                                          message=f"This document: '{document_id}' is not a valid document.")
+
+         _, doc = DocumentService.get_by_id(document_id)  # get doc object
+         doc_attributes = doc.to_dict()
+
+         return construct_json_result(
+             data={"progress": doc_attributes["progress"], "status": TaskStatus(doc_attributes["status"]).name},
+             code=RetCode.SUCCESS
+         )
+     except Exception as e:
+         return construct_error_response(e)
+
+
+ @manager.route('/run', methods=['POST'])
+ @token_required
+ def run(tenant_id):
+     req = request.json
+     try:
+         for id in req["doc_ids"]:
+             info = {"run": str(req["run"]), "progress": 0}
+             if str(req["run"]) == TaskStatus.RUNNING.value:
+                 info["progress_msg"] = ""
+                 info["chunk_num"] = 0
+                 info["token_num"] = 0
+             DocumentService.update_by_id(id, info)
+             # if str(req["run"]) == TaskStatus.CANCEL.value:
+             tenant_id = DocumentService.get_tenant_id(id)
+             if not tenant_id:
+                 return get_data_error_result(retmsg="Tenant not found!")
+             ELASTICSEARCH.deleteByQuery(
+                 Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+             if str(req["run"]) == TaskStatus.RUNNING.value:
+                 TaskService.filter_delete([Task.doc_id == id])
+                 e, doc = DocumentService.get_by_id(id)
+                 doc = doc.to_dict()
+                 doc["tenant_id"] = tenant_id
+                 bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+                 queue_tasks(doc, bucket, name)
+
+         return get_json_result(data=True)
+     except Exception as e:
+         return server_error_response(e)
+
+
+ @manager.route('/chunk/list', methods=['POST'])
+ @token_required
+ @validate_request("doc_id")
+ def list_chunk(tenant_id):
+     req = request.json
+     doc_id = req["doc_id"]
+     page = int(req.get("page", 1))
+     size = int(req.get("size", 30))
+     question = req.get("keywords", "")
+     try:
+         tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+         if not tenant_id:
+             return get_data_error_result(retmsg="Tenant not found!")
+         e, doc = DocumentService.get_by_id(doc_id)
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         query = {
+             "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
+         }
+         if "available_int" in req:
+             query["available_int"] = int(req["available_int"])
+         sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
+         res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
+         for id in sres.ids:
+             d = {
+                 "chunk_id": id,
+                 "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
+                     id].get("content_with_weight", ""),
+                 "doc_id": sres.field[id]["doc_id"],
+                 "docnm_kwd": sres.field[id]["docnm_kwd"],
+                 "important_kwd": sres.field[id].get("important_kwd", []),
+                 "img_id": sres.field[id].get("img_id", ""),
+                 "available_int": sres.field[id].get("available_int", 1),
+                 "positions": sres.field[id].get("position_int", "").split("\t")
+             }
+             if len(d["positions"]) % 5 == 0:
+                 poss = []
+                 for i in range(0, len(d["positions"]), 5):
+                     poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                  float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                 d["positions"] = poss
+             res["chunks"].append(d)
+         return get_json_result(data=res)
+     except Exception as e:
+         if str(e).find("not_found") > 0:
+             return get_json_result(data=False, retmsg='No chunk found!',
+                                    retcode=RetCode.DATA_ERROR)
+         return server_error_response(e)
+
+
+ @manager.route('/chunk/create', methods=['POST'])
+ @token_required
+ @validate_request("doc_id", "content_with_weight")
+ def create(tenant_id):
+     req = request.json
+     md5 = hashlib.md5()
+     md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
+     chunk_id = md5.hexdigest()
+     d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
+          "content_with_weight": req["content_with_weight"]}
+     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+     d["important_kwd"] = req.get("important_kwd", [])
+     d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
+     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
+     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+
+     try:
+         e, doc = DocumentService.get_by_id(req["doc_id"])
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         d["kb_id"] = [doc.kb_id]
+         d["docnm_kwd"] = doc.name
+         d["doc_id"] = doc.id
+
+         tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+         if not tenant_id:
+             return get_data_error_result(retmsg="Tenant not found!")
+
+         embd_id = DocumentService.get_embd_id(req["doc_id"])
+         embd_mdl = TenantLLMService.model_instance(
+             tenant_id, LLMType.EMBEDDING.value, embd_id)
+
+         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
+         v = 0.1 * v[0] + 0.9 * v[1]
+         d["q_%d_vec" % len(v)] = v.tolist()
+         ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
+
+         DocumentService.increment_chunk_num(
+             doc.id, doc.kb_id, c, 1, 0)
+         return get_json_result(data={"chunk": d})
+         # return get_json_result(data={"chunk_id": chunk_id})
+     except Exception as e:
+         return server_error_response(e)
+
+ @manager.route('/chunk/rm', methods=['POST'])
+ @token_required
+ @validate_request("chunk_ids", "doc_id")
+ def rm(tenant_id):
+     req = request.json
+     try:
+         if not ELASTICSEARCH.deleteByQuery(
+                 Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
+             return get_data_error_result(retmsg="Index updating failure")
+         e, doc = DocumentService.get_by_id(req["doc_id"])
+         if not e:
+             return get_data_error_result(retmsg="Document not found!")
+         deleted_chunk_ids = req["chunk_ids"]
+         chunk_number = len(deleted_chunk_ids)
+         DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
+         return get_json_result(data=True)
+     except Exception as e:
+         return server_error_response(e)
sdk/python/ragflow/__init__.py CHANGED
@@ -6,4 +6,5 @@ from .ragflow import RAGFlow
  from .modules.dataset import DataSet
  from .modules.assistant import Assistant
  from .modules.session import Session
- from .modules.document import Document
+ from .modules.document import Document
+ from .modules.chunk import Chunk
sdk/python/ragflow/modules/chunk.py ADDED
@@ -0,0 +1,34 @@
+ from .base import Base
+
+
+ class Chunk(Base):
+     def __init__(self, rag, res_dict):
+         # Initialize the attributes of the class
+         self.id = ""
+         self.content_with_weight = ""
+         self.content_ltks = []
+         self.content_sm_ltks = []
+         self.important_kwd = []
+         self.important_tks = []
+         self.create_time = ""
+         self.create_timestamp_flt = 0.0
+         self.kb_id = None
+         self.docnm_kwd = ""
+         self.doc_id = ""
+         self.q_vec = []
+         self.status = "1"
+         for k, v in res_dict.items():
+             if hasattr(self, k):
+                 setattr(self, k, v)
+
+         super().__init__(rag, res_dict)
+
+     def delete(self) -> bool:
+         """
+         Delete the chunk in the document.
+         """
+         res = self.rm('/doc/chunk/rm',
+                       {"doc_id": self.doc_id, "chunk_ids": [self.id]})
+         res = res.json()
+         if res.get("retmsg") == "success":
+             return True
+         raise Exception(res["retmsg"])
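Note: a minimal usage sketch for the new Chunk class, mirroring the tests added below; the credentials and document name are placeholders.

from ragflow import RAGFlow

rag = RAGFlow("<API_KEY>", "<HOST_ADDRESS>")   # placeholder credentials
doc = rag.get_document(name="story.txt")       # any existing, parsed document
chunk = doc.add_chunk(content="a temporary chunk")
chunk.delete()                                 # removes the chunk from the index again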
sdk/python/ragflow/modules/document.py CHANGED
@@ -1,6 +1,7 @@

  from .base import Base
-


  class Document(Base):
@@ -21,6 +22,8 @@ class Document(Base):
      self.progress_msg = ""
      self.process_begin_at = None
      self.process_duration = 0.0
      for k in list(res_dict.keys()):
          if k not in self.__dict__:
              res_dict.pop(k)
@@ -61,7 +64,7 @@ class Document(Base):
      :return: The downloaded document content in bytes.
      """
      # Construct the URL for the API request using the document ID and knowledge base ID
-     res = self.get(f"/doc/{self.kb_id}/documents/{self.id}",
                     {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})

      # Check the response status code to ensure the request was successful
@@ -73,3 +76,121 @@ class Document(Base):
          raise Exception(
              f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
          )
+ import time

  from .base import Base
+ from .chunk import Chunk


  class Document(Base):

      self.progress_msg = ""
      self.process_begin_at = None
      self.process_duration = 0.0
+     self.run = "0"
+     self.status = "1"
      for k in list(res_dict.keys()):
          if k not in self.__dict__:
              res_dict.pop(k)
64
  :return: The downloaded document content in bytes.
65
  """
66
  # Construct the URL for the API request using the document ID and knowledge base ID
67
+ res = self.get(f"/doc/{self.id}",
68
  {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
69
 
70
  # Check the response status code to ensure the request was successful
 
76
  raise Exception(
77
  f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
78
  )
79
+
+     def async_parse(self):
+         """
+         Initiate document parsing asynchronously without waiting for completion.
+         """
+         try:
+             # Construct request data including document ID and run status (assuming 1 means to run)
+             data = {"doc_ids": [self.id], "run": 1}
+
+             # Send a POST request to the parsing endpoint to start parsing
+             res = self.post('/doc/run', data)
+
+             # Check the server response status code
+             if res.status_code != 200:
+                 raise Exception(f"Failed to start async parsing: {res.text}")
+
+             print("Async parsing started successfully.")
+
+         except Exception as e:
+             # Catch and handle exceptions
+             print(f"Error occurred during async parsing: {str(e)}")
+             raise
+
+     def join(self, interval=5, timeout=3600):
+         """
+         Wait for the asynchronous parsing to complete and yield parsing progress periodically.
+
+         :param interval: The time interval (in seconds) for progress reports.
+         :param timeout: The timeout (in seconds) for the parsing operation.
+         :return: An iterator yielding parsing progress and messages.
+         """
+         start_time = time.time()
+         while time.time() - start_time < timeout:
+             # Check the parsing status
+             res = self.get(f'/doc/{self.id}/status', {"doc_ids": [self.id]})
+             res_data = res.json()
+             data = res_data.get("data", {})
+
+             # Retrieve progress and status message
+             progress = data.get("progress", 0)
+             progress_msg = data.get("status", "")
+
+             yield progress, progress_msg  # Yield progress and message
+
+             if progress == 100:  # Parsing completed
+                 break
+
+             time.sleep(interval)
+
+     def cancel(self):
+         """
+         Cancel the parsing task for the document.
+         """
+         try:
+             # Construct request data, including document ID and action to cancel (assuming 2 means cancel)
+             data = {"doc_ids": [self.id], "run": 2}
+
+             # Send a POST request to the parsing endpoint to cancel parsing
+             res = self.post('/doc/run', data)
+
+             # Check the server response status code
+             if res.status_code != 200:
+                 print("Failed to cancel parsing. Server response:", res.text)
+             else:
+                 print("Parsing cancelled successfully.")
+
+         except Exception as e:
+             print(f"Error occurred during async parsing cancellation: {str(e)}")
+             raise
+
+     def list_chunks(self, page=1, offset=0, limit=12, size=30, keywords="", available_int=None):
+         """
+         List all chunks associated with this document by calling the external API.
+
+         Args:
+             page (int): The page number to retrieve (default 1).
+             size (int): The number of chunks per page (default 30).
+             keywords (str): Keywords for searching specific chunks (default "").
+             offset (int): The starting index of the chunks to retrieve (default 0).
+             limit (int): The maximum number of chunks to retrieve (default 12).
+             available_int (int): Filter for available chunks (optional).
+
+         Returns:
+             list: A list of chunks returned from the API.
+         """
+         data = {
+             "doc_id": self.id,
+             "page": page,
+             "size": size,
+             "keywords": keywords,
+             "offset": offset,
+             "limit": limit
+         }
+
+         if available_int is not None:
+             data["available_int"] = available_int
+
+         res = self.post('/doc/chunk/list', data)
+         if res.status_code == 200:
+             res_data = res.json()
+             if res_data.get("retmsg") == "success":
+                 chunks = res_data["data"]["chunks"]
+                 self.chunks = chunks  # Store the chunks in the document instance
+                 return chunks
+             else:
+                 raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
+         else:
+             raise Exception(f"API request failed with status code {res.status_code}")
+
+     def add_chunk(self, content: str):
+         res = self.post('/doc/chunk/create', {"doc_id": self.id, "content_with_weight": content})
+
+         # The response is expected to carry the new chunk under data["chunk"]
+         if res.status_code == 200:
+             chunk_data = res.json()["data"]["chunk"]
+             return Chunk(self.rag, chunk_data)  # wrap the payload in a Chunk object
+         else:
+             raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
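Note: taken together, the new Document methods support an enqueue-then-poll workflow. A sketch of the intended flow, with placeholder credentials and document name:

from ragflow import RAGFlow

rag = RAGFlow("<API_KEY>", "<HOST_ADDRESS>")   # placeholder credentials
doc = rag.get_document(name="ai.pdf")

doc.async_parse()                              # enqueue parsing; returns immediately
for progress, msg in doc.join(interval=5, timeout=3600):
    print(f"{progress}% {msg}")                # periodic progress reports

for chunk in doc.list_chunks(keywords=""):
    print(chunk["content_with_weight"])        # chunks come back as plain dicts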
sdk/python/ragflow/ragflow.py CHANGED
@@ -171,3 +171,50 @@ class RAGFlow:
          return Document(self, res['data'])
      raise Exception(res["retmsg"])

+     def async_parse_documents(self, doc_ids):
+         """
+         Asynchronously start parsing multiple documents without waiting for completion.
+
+         :param doc_ids: A list containing multiple document IDs.
+         """
+         try:
+             if not doc_ids or not isinstance(doc_ids, list):
+                 raise ValueError("doc_ids must be a non-empty list of document IDs")
+
+             data = {"doc_ids": doc_ids, "run": 1}
+
+             res = self.post('/doc/run', data)
+
+             if res.status_code != 200:
+                 raise Exception(f"Failed to start async parsing for documents: {res.text}")
+
+             print(f"Async parsing started successfully for documents: {doc_ids}")
+
+         except Exception as e:
+             print(f"Error occurred during async parsing for documents: {str(e)}")
+             raise
+
+     def async_cancel_parse_documents(self, doc_ids):
+         """
+         Cancel the asynchronous parsing of multiple documents.
+
+         :param doc_ids: A list containing multiple document IDs.
+         """
+         try:
+             if not doc_ids or not isinstance(doc_ids, list):
+                 raise ValueError("doc_ids must be a non-empty list of document IDs")
+             data = {"doc_ids": doc_ids, "run": 2}
+
+             res = self.post('/doc/run', data)
+
+             if res.status_code != 200:
+                 raise Exception(f"Failed to cancel async parsing for documents: {res.text}")
+
+             print(f"Async parsing canceled successfully for documents: {doc_ids}")
+
+         except Exception as e:
+             print(f"Error occurred during canceling parsing for documents: {str(e)}")
+             raise
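Note: the bulk helpers drive the same /doc/run endpoint for many documents at once; run=1 starts parsing and run=2 cancels it, as the method bodies above show. A short sketch with placeholder credentials and file names:

from ragflow import RAGFlow

rag = RAGFlow("<API_KEY>", "<HOST_ADDRESS>")             # placeholder credentials
docs = [rag.get_document(name=n) for n in ("ai1.pdf", "ai2.pdf", "ai3.pdf")]
ids = [d.id for d in docs]

rag.async_parse_documents(ids)                           # run=1: start parsing all
# ...later, if the batch needs to be aborted:
rag.async_cancel_parse_documents(ids)                    # run=2: cancel parsing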
sdk/python/test/t_document.py CHANGED
@@ -1,4 +1,4 @@
- from ragflow import RAGFlow, DataSet, Document

  from common import API_KEY, HOST_ADDRESS
  from test_sdkbase import TestSdk
@@ -46,6 +46,7 @@ class TestDocument(TestSdk):
      doc = rag.get_document(name="TestDocument.txt")
      if isinstance(doc, Document):
          doc.parser_method = "manual"
          res = doc.save()
          assert res is True, f"Failed to update document, error: {res}"
      else:
@@ -126,8 +127,8 @@ class TestDocument(TestSdk):
      blob1 = b"Sample document content for ingestion test333."
      name2 = "Test Document444.txt"
      blob2 = b"Sample document content for ingestion test444."
-     name3='test.txt'
-     path='test_data/test.txt'
      rag.create_document(ds, name=name3, blob=open(path, "rb").read())
      rag.create_document(ds, name=name1, blob=blob1)
      rag.create_document(ds, name=name2, blob=blob2)
@@ -138,7 +139,131 @@ class TestDocument(TestSdk):
      remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
      assert len(remaining_docs) == 0, "Documents were not properly deleted."
+ from ragflow import RAGFlow, DataSet, Document, Chunk

  from common import API_KEY, HOST_ADDRESS
  from test_sdkbase import TestSdk

      doc = rag.get_document(name="TestDocument.txt")
      if isinstance(doc, Document):
          doc.parser_method = "manual"
+         doc.name = "manual.txt"
          res = doc.save()
          assert res is True, f"Failed to update document, error: {res}"
      else:

      blob1 = b"Sample document content for ingestion test333."
      name2 = "Test Document444.txt"
      blob2 = b"Sample document content for ingestion test444."
+     name3 = 'test.txt'
+     path = 'test_data/test.txt'
      rag.create_document(ds, name=name3, blob=open(path, "rb").read())
      rag.create_document(ds, name=name1, blob=blob1)
      rag.create_document(ds, name=name2, blob=blob2)

      remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
      assert len(remaining_docs) == 0, "Documents were not properly deleted."

+     def test_parse_and_cancel_document(self):
+         # Initialize RAGFlow with API key and host address
+         rag = RAGFlow(API_KEY, HOST_ADDRESS)
+
+         # Create a dataset with a specific name
+         ds = rag.create_dataset(name="God4")
+
+         # Define the document name and path
+         name3 = 'ai.pdf'
+         path = 'test_data/ai.pdf'
+
+         # Create a document in the dataset using the file path
+         rag.create_document(ds, name=name3, blob=open(path, "rb").read())
+
+         # Retrieve the document by name
+         doc = rag.get_document(name="ai.pdf")
+
+         # Initiate asynchronous parsing
+         doc.async_parse()
+
+         # Print message to confirm asynchronous parsing has been initiated
+         print("Async parsing initiated")
+
+         # Use join to wait for parsing to complete and get progress updates
+         for progress, msg in doc.join(interval=5, timeout=10):
+             print(progress, msg)
+             # Assert that the progress is within the valid range (0 to 100)
+             assert 0 <= progress <= 100, f"Invalid progress: {progress}"
+             # Assert that the message is not empty
+             assert msg, "Message should not be empty"
+         # Test cancelling the parsing operation
+         doc.cancel()
+         # Print message to confirm parsing has been cancelled successfully
+         print("Parsing cancelled successfully")
+
+     def test_bulk_parse_and_cancel_documents(self):
+         # Initialize RAGFlow with API key and host address
+         rag = RAGFlow(API_KEY, HOST_ADDRESS)
+
+         # Create a dataset
+         ds = rag.create_dataset(name="God5")
+         assert ds is not None, "Dataset creation failed"
+         assert ds.name == "God5", "Dataset name does not match"
+
+         # Prepare a list of file names and paths
+         documents = [
+             {'name': 'ai1.pdf', 'path': 'test_data/ai1.pdf'},
+             {'name': 'ai2.pdf', 'path': 'test_data/ai2.pdf'},
+             {'name': 'ai3.pdf', 'path': 'test_data/ai3.pdf'}
+         ]
+
+         # Create documents in bulk
+         for doc_info in documents:
+             with open(doc_info['path'], "rb") as file:
+                 created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
+                 assert created_doc is not None, f"Failed to create document {doc_info['name']}"
+
+         # Retrieve document objects in bulk
+         docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
+         ids = [doc.id for doc in docs]
+         assert len(docs) == len(documents), "Mismatch between created documents and fetched documents"
+
+         # Initiate asynchronous parsing for all documents
+         rag.async_parse_documents(ids)
+         print("Async bulk parsing initiated")
+
+         # Wait for all documents to finish parsing and check progress
+         for doc in docs:
+             for progress, msg in doc.join(interval=5, timeout=10):
+                 print(f"{doc.name}: Progress: {progress}, Message: {msg}")
+
+                 # Assert that progress is within the valid range
+                 assert 0 <= progress <= 100, f"Invalid progress: {progress} for document {doc.name}"
+
+                 # Assert that the message is not empty
+                 assert msg, f"Message should not be empty for document {doc.name}"
+
+                 # If progress reaches 100%, assert that parsing is completed successfully
+                 if progress == 100:
+                     assert "completed" in msg.lower(), f"Document {doc.name} did not complete successfully"
+
+         # Cancel parsing for all documents in bulk
+         cancel_result = rag.async_cancel_parse_documents(ids)
+         assert cancel_result is None, "Failed to cancel document parsing"
+         print("Async bulk parsing cancelled")
+
+     def test_parse_document_and_chunk_list(self):
+         rag = RAGFlow(API_KEY, HOST_ADDRESS)
+         ds = rag.create_dataset(name="God7")
+         name = 'story.txt'
+         path = 'test_data/story.txt'
+         # name = "Test Document rag.txt"
+         # blob = " Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps.Sample document content for rag test66. rag wonderful apple os documents apps.Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps. Sample document content for rag test66. rag wonderful apple os documents apps."
+         rag.create_document(ds, name=name, blob=open(path, "rb").read())
+         doc = rag.get_document(name=name)
+         doc.async_parse()
+
+         # Wait for parsing to complete and get progress updates using join
+         for progress, msg in doc.join(interval=5, timeout=30):
+             print(progress, msg)
+             # Assert that progress is within 0 to 100
+             assert 0 <= progress <= 100, f"Invalid progress: {progress}"
+             # Assert that the message is not empty
+             assert msg, "Message should not be empty"
+
+         for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
+             print(c)
+             assert c is not None, "Chunk is None"
+             assert "rag" in c['content_with_weight'].lower(), f"Keyword 'rag' not found in chunk content: {c['content_with_weight']}"
+
+     def test_add_chunk_to_chunk_list(self):
+         rag = RAGFlow(API_KEY, HOST_ADDRESS)
+         doc = rag.get_document(name='story.txt')
+         chunk = doc.add_chunk(content="assss")
+         assert chunk is not None, "Chunk is None"
+         assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
+
+     def test_delete_chunk_of_chunk_list(self):
+         rag = RAGFlow(API_KEY, HOST_ADDRESS)
+         doc = rag.get_document(name='story.txt')
+         chunk = doc.add_chunk(content="assss")
+         assert chunk is not None, "Chunk is None"
+         assert isinstance(chunk, Chunk), "Chunk was not added to chunk list"
+         chunk_num_before = doc.chunk_num
+         chunk.delete()
+         assert doc.chunk_num == chunk_num_before - 1, "Chunk was not deleted"
sdk/python/test/test_data/ragflow_test.txt ADDED
@@ -0,0 +1,29 @@
+
+
+ Introducing RagFlow: Revolutionizing Natural Language Processing with Retrieval-Augmented Generation
+
+ In the ever-evolving landscape of Natural Language Processing (NLP), new techniques and frameworks continue to push the boundaries of what machines can understand and generate from human language. Among these innovative advancements, RagFlow stands out as a pioneering approach that combines the power of retrieval and generation to revolutionize the way we interact with text-based data.
+
+ What is RagFlow?
+
+ RagFlow, short for Retrieval-Augmented Generation Flow, is a framework designed to enhance the capabilities of NLP models by integrating a retrieval component into the generation process. This approach leverages large-scale knowledge bases and text corpora to retrieve relevant information that can inform and enrich the output generated by the model. By doing so, RagFlow enables models to produce more accurate, informative, and contextually relevant responses, surpassing the limitations of traditional generation-only or retrieval-only systems.
+
+ The Core Concept
+
+ At its core, RagFlow operates on two fundamental principles:
+
+ Retrieval: The first step involves identifying and retrieving relevant information from a vast collection of text sources. This can include web pages, academic articles, books, or any other form of unstructured text data. RagFlow employs advanced retrieval algorithms, often based on neural networks and vector similarity, to quickly and accurately locate the most pertinent information for a given query or task.
+ Generation: Once relevant information has been retrieved, RagFlow leverages generative NLP models to produce the final output. These models, such as transformers or GPT-like architectures, are trained to understand the context provided by the retrieved information and generate coherent, fluent text that incorporates this knowledge. The integration of retrieval and generation allows RagFlow to generate responses that are not only grammatically correct but also semantically rich and contextually appropriate.
+ Advantages of RagFlow
+
+ Increased Accuracy and Relevance: By incorporating retrieved information, RagFlow can generate responses that are more accurate and relevant to the user's query or task. This is particularly useful in domains where factual accuracy and contextual relevance are crucial, such as question answering, summarization, and knowledge-intensive dialogue systems.
+ Scalability and Flexibility: RagFlow's reliance on large-scale text corpora and retrieval algorithms makes it highly scalable to new domains and datasets. As more data becomes available, the retrieval component can be easily updated to incorporate new information, while the generative model can be fine-tuned to adapt to specific tasks or user preferences.
+ Improved Efficiency: By leveraging pre-existing knowledge bases and retrieval algorithms, RagFlow can reduce the computational burden on the generative model. This allows the model to focus on generating high-quality output rather than searching for relevant information from scratch, resulting in improved efficiency and faster response times.
+ Applications and Future Directions
+
+ RagFlow has the potential to transform a wide range of NLP applications, including but not limited to:
+
+ Question Answering Systems: By retrieving relevant passages and generating precise answers, RagFlow can enhance the accuracy and comprehensiveness of question answering systems.
+ Document Summarization: By identifying key information and generating concise summaries, RagFlow can help users quickly grasp the main points of lengthy documents.
+ Creative Writing and Storytelling: By incorporating retrieved elements into the generation process, RagFlow can inspire and augment creative writing, enabling machines to produce more engaging and original stories.
+ As the field of NLP continues to evolve, RagFlow represents a promising direction for leveraging the power of both retrieval and generation. With further research and development, we can expect to see even more sophisticated and versatile RagFlow-based systems that push the boundaries of what machines can achieve with human language.
sdk/python/test/test_data/story.txt ADDED
@@ -0,0 +1,8 @@
+ Once upon a time, in a small village nestled at the foot of a towering mountain, lived a young girl named Lily. Lily had a heart as pure as the mountain's snowcaps and a spirit as adventurous as the winding trails that led to its peak.
+ One day, as Lily was gathering berries in the forest's edge, she stumbled upon an old, weathered map hidden beneath a fallen tree. The map was covered in strange symbols and a single, glowing word: "Treasure." Curiosity piqued, Lily decided to embark on a quest to uncover the mystery of the treasure.
+ With nothing more than her trusty basket of berries, a few pieces of bread, and the map, Lily set off into the unknown. As she climbed higher and higher into the mountains, the air grew crisp, and the scenery transformed into a breathtaking tapestry of lush greenery and sparkling streams.
+ Along the way, Lily encountered all sorts of challenges. She had to navigate treacherous rivers using fallen logs as bridges, climb steep cliffs with nothing but her agility and determination, and even outsmart a mischievous pack of foxes that tried to lead her astray. But through it all, Lily remained steadfast, her heart filled with hope and a sense of purpose.
+ Finally, after what seemed like an eternity of trekking, Lily arrived at a hidden valley. At its center stood an ancient tree, its roots entwined with glittering jewels and a chest made of pure gold. This, the map had revealed, was the source of the treasure.
+ But as Lily approached the chest, she realized that the true treasure was not the riches before her. It was the journey itself—the friendships she had forged with the animals she encountered, the strength she had gained from overcoming obstacles, and the sense of wonder and discovery that filled her heart.
+ With a smile on her face, Lily gently closed the chest and left it where it was, content in the knowledge that the greatest treasures in life are not always found in gold or jewels. She turned back towards home, her heart full of stories to share and a spirit that had been forever changed by her adventure.
+ And so, Lily returned to her village, a hero in her own right, with a tale that would be whispered around firesides for generations to come.
sdk/python/test/test_data/test1.txt CHANGED
@@ -1,2 +1,4 @@
  test1
- test1
+ test1
+ aaaa document args arg
+ rag document
sdk/python/test/test_data/test2.txt ADDED
@@ -0,0 +1,4 @@
+ test22
+ test22
+ aaaa document args arg
+ rag document
sdk/python/test/test_data/test3.txt ADDED
@@ -0,0 +1,4 @@
+ test3
+ test333
+ aaaa document args arg
+ rag document
sdk/python/test/test_data/westworld.pdf ADDED
Binary file (33.1 kB)