GYH committed on
Commit
f4cd7c3
·
1 Parent(s): 0711582

0517 list chunks (#821)

Browse files

### What problem does this PR solve?

#717

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/api_app.py CHANGED
@@ -39,6 +39,9 @@ from itsdangerous import URLSafeTimedSerializer
39
  from api.utils.file_utils import filename_type, thumbnail
40
  from rag.utils.minio_conn import MINIO
41
 
 
 
 
42
 
43
  def generate_confirmation_token(tenent_id):
44
  serializer = URLSafeTimedSerializer(tenent_id)
@@ -347,3 +350,43 @@ def upload():
347
  return server_error_response(e)
348
 
349
  return get_json_result(data=doc_result.to_json())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  from api.utils.file_utils import filename_type, thumbnail
40
  from rag.utils.minio_conn import MINIO
41
 
42
+ from rag.utils.es_conn import ELASTICSEARCH
43
+ from rag.nlp import search
44
+ from elasticsearch_dsl import Q
45
 
46
  def generate_confirmation_token(tenent_id):
47
  serializer = URLSafeTimedSerializer(tenent_id)
 
350
  return server_error_response(e)
351
 
352
  return get_json_result(data=doc_result.to_json())
353
+
354
+
355
+ @manager.route('/list_chunks', methods=['POST'])
356
+ # @login_required
357
+ def list_chunks():
358
+ token = request.headers.get('Authorization').split()[1]
359
+ objs = APIToken.query(token=token)
360
+ if not objs:
361
+ return get_json_result(
362
+ data=False, retmsg='Token is not valid!"', retcode=RetCode.AUTHENTICATION_ERROR)
363
+
364
+ form_data = request.form
365
+
366
+ try:
367
+ if "doc_name" in form_data.keys():
368
+ tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
369
+ q = Q("match", docnm_kwd=form_data['doc_name'])
370
+
371
+ elif "doc_id" in form_data.keys():
372
+ tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
373
+ q = Q("match", doc_id=form_data['doc_id'])
374
+ else:
375
+ return get_json_result(
376
+ data=False,retmsg="Can't find doc_name or doc_id"
377
+ )
378
+
379
+ res_es_search = ELASTICSEARCH.search(q,idxnm=search.index_name(tenant_id),timeout="600s")
380
+
381
+ res = [{} for _ in range(len(res_es_search['hits']['hits']))]
382
+
383
+ for index , chunk in enumerate(res_es_search['hits']['hits']):
384
+ res[index]['doc_name'] = chunk['_source']['docnm_kwd']
385
+ res[index]['content'] = chunk['_source']['content_with_weight']
386
+ if 'img_id' in chunk['_source'].keys():
387
+ res[index]['img_id'] = chunk['_source']['img_id']
388
+
389
+ except Exception as e:
390
+ return server_error_response(e)
391
+
392
+ return get_json_result(data=res)
api/db/services/document_service.py CHANGED
@@ -166,6 +166,19 @@ class DocumentService(CommonService):
166
  return
167
  return docs[0]["tenant_id"]
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  @classmethod
170
  @DB.connection_context()
171
  def get_thumbnails(cls, docids):
 
166
  return
167
  return docs[0]["tenant_id"]
168
 
169
+ @classmethod
170
+ @DB.connection_context()
171
+ def get_tenant_id_by_name(cls, name):
172
+ docs = cls.model.select(
173
+ Knowledgebase.tenant_id).join(
174
+ Knowledgebase, on=(
175
+ Knowledgebase.id == cls.model.kb_id)).where(
176
+ cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
177
+ docs = docs.dicts()
178
+ if not docs:
179
+ return
180
+ return docs[0]["tenant_id"]
181
+
182
  @classmethod
183
  @DB.connection_context()
184
  def get_thumbnails(cls, docids):
docs/conversation_api.md CHANGED
@@ -364,3 +364,38 @@ This is usually used when upload a file to.
364
  }
365
 
366
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  }
365
 
366
  ```
367
+
368
+ ## Get document chunks
369
+
370
+ Get the chunks of the document based on doc_name or doc_id.
371
+ ### Path: /api/list_chunks/
372
+ ### Method: POST
373
+
374
+ ### Parameter:
375
+
376
+ | Name | Type | Optional | Description |
377
+ |----------|--------|----------|---------------------------------|
378
+ | `doc_name` | string | Yes | The name of the document in the knowledge base. It must not be empty if `doc_id` is not set.|
379
+ | `doc_id` | string | Yes | The ID of the document in the knowledge base. It must not be empty if `doc_name` is not set.|
380
+
381
+
382
+ ### Response
383
+ ```json
384
+ {
385
+ "data": [
386
+ {
387
+ "content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
388
+ "doc_name": "RL-Cache.pdf",
389
+ "img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
390
+ },
391
+ {
392
+ "content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how effectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
393
+ "doc_name": "RL-Cache.pdf",
394
+ "img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
395
+ }
396
+ ],
397
+ "retcode": 0,
398
+ "retmsg": "success"
399
+ }
400
+
401
+ ```