devMls
Miguel
committed on
Commit
·
ffeefff
1
Parent(s):
5548e5b
organize chunks by document in the prompt (#3925)
Browse files### What problem does this PR solve?
This PR organizes chunks in the prompt by document and indicates
the name of each document, in this way:
```
Document: {doc_name} \nContains the following relevant fragments:
chunk1
chunk2
chunk3
Document: {doc_name} \nContains the following relevant fragments:
chunk4
chunk5
```
Maybe this can be a baseline for adding metadata to the documents.
In my case, this allows improving the LLM's context about the origin of the
information.
### Type of change
- [X] New Feature (non-breaking change which adds functionality)
Co-authored-by: Miguel <your-noreply-github-email>
api/db/services/dialog_service.py
CHANGED
|
@@ -195,7 +195,32 @@ def chat(dialog, messages, stream=True, **kwargs):
|
|
| 195 |
dialog.vector_similarity_weight,
|
| 196 |
doc_ids=attachments,
|
| 197 |
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
logging.debug(
|
| 200 |
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
|
| 201 |
retrieval_tm = timer()
|
|
@@ -592,12 +617,40 @@ def ask(question, kb_ids, tenant_id):
|
|
| 592 |
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
|
| 593 |
|
| 594 |
used_token_count = 0
|
|
|
|
| 595 |
for i, c in enumerate(knowledges):
|
| 596 |
used_token_count += num_tokens_from_string(c)
|
| 597 |
if max_tokens * 0.97 < used_token_count:
|
| 598 |
knowledges = knowledges[:i]
|
|
|
|
| 599 |
break
|
| 600 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
prompt = """
|
| 602 |
Role: You're a smart assistant. Your name is Miss R.
|
| 603 |
Task: Summarize the information from knowledge bases and answer user's question.
|
|
|
|
| 195 |
dialog.vector_similarity_weight,
|
| 196 |
doc_ids=attachments,
|
| 197 |
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
|
| 198 |
+
|
| 199 |
+
# Group chunks by document ID
|
| 200 |
+
doc_chunks = {}
|
| 201 |
+
for ck in kbinfos["chunks"]:
|
| 202 |
+
doc_id = ck["doc_id"]
|
| 203 |
+
if doc_id not in doc_chunks:
|
| 204 |
+
doc_chunks[doc_id] = []
|
| 205 |
+
doc_chunks[doc_id].append(ck["content_with_weight"])
|
| 206 |
+
|
| 207 |
+
# Create knowledges list with grouped chunks
|
| 208 |
+
knowledges = []
|
| 209 |
+
for doc_id, chunks in doc_chunks.items():
|
| 210 |
+
# Find the corresponding document name
|
| 211 |
+
doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id)
|
| 212 |
+
|
| 213 |
+
# Create a header for the document
|
| 214 |
+
doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n"
|
| 215 |
+
|
| 216 |
+
# Add numbered fragments
|
| 217 |
+
for i, chunk in enumerate(chunks, 1):
|
| 218 |
+
doc_knowledge += f"{i}. {chunk}\n"
|
| 219 |
+
|
| 220 |
+
knowledges.append(doc_knowledge)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
logging.debug(
|
| 225 |
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
|
| 226 |
retrieval_tm = timer()
|
|
|
|
| 617 |
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
|
| 618 |
|
| 619 |
used_token_count = 0
|
| 620 |
+
chunks_num = 0
|
| 621 |
for i, c in enumerate(knowledges):
|
| 622 |
used_token_count += num_tokens_from_string(c)
|
| 623 |
if max_tokens * 0.97 < used_token_count:
|
| 624 |
knowledges = knowledges[:i]
|
| 625 |
+
chunks_num = chunks_num + 1
|
| 626 |
break
|
| 627 |
|
| 628 |
+
# Group chunks by document ID
|
| 629 |
+
doc_chunks = {}
|
| 630 |
+
counter_chunks = 0
|
| 631 |
+
for ck in kbinfos["chunks"]:
|
| 632 |
+
if counter_chunks < chunks_num:
|
| 633 |
+
counter_chunks = counter_chunks + 1
|
| 634 |
+
doc_id = ck["doc_id"]
|
| 635 |
+
if doc_id not in doc_chunks:
|
| 636 |
+
doc_chunks[doc_id] = []
|
| 637 |
+
doc_chunks[doc_id].append(ck["content_with_weight"])
|
| 638 |
+
|
| 639 |
+
# Create knowledges list with grouped chunks
|
| 640 |
+
knowledges = []
|
| 641 |
+
for doc_id, chunks in doc_chunks.items():
|
| 642 |
+
# Find the corresponding document name
|
| 643 |
+
doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id)
|
| 644 |
+
|
| 645 |
+
# Create a header for the document
|
| 646 |
+
doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n"
|
| 647 |
+
|
| 648 |
+
# Add numbered fragments
|
| 649 |
+
for i, chunk in enumerate(chunks, 1):
|
| 650 |
+
doc_knowledge += f"{i}. {chunk}\n"
|
| 651 |
+
|
| 652 |
+
knowledges.append(doc_knowledge)
|
| 653 |
+
|
| 654 |
prompt = """
|
| 655 |
Role: You're a smart assistant. Your name is Miss R.
|
| 656 |
Task: Summarize the information from knowledge bases and answer user's question.
|