Kevin Hu committed
Commit 9cfd69b · Parent(s): 2f2501f

Code refactor. (#4291)

### What problem does this PR solve?
### Type of change
- [x] Refactoring
- agent/component/answer.py +10 -0
- api/apps/canvas_app.py +4 -0
- api/apps/dialog_app.py +2 -4
- api/apps/kb_app.py +2 -1
- api/utils/api_utils.py +4 -0
- graphrag/graph_prompt.py +4 -4
- graphrag/utils.py +1 -1
- rag/app/laws.py +3 -5
- rag/app/manual.py +2 -2
- rag/app/table.py +1 -1
agent/component/answer.py
CHANGED

@@ -16,6 +16,7 @@
 import random
 from abc import ABC
 from functools import partial
+from typing import Tuple, Union
 
 import pandas as pd
 
@@ -76,4 +77,13 @@ class Answer(ComponentBase, ABC):
     def set_exception(self, e):
         self.exception = e
 
+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial:
+            return super.output()
+
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+
+        self._param.output_var_name, pd.DataFrame([])
 
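The new `output()` override falls back to the most recent user turn when partial output is disallowed. A minimal sketch of that fallback, using a plain list of `(role, content)` tuples in place of the real canvas history; note the committed body calls `super.output()` (apparently intending `super()`) and appears to omit the final `return`:

```python
# Sketch of the fallback path in the new Answer.output(), with a plain
# list of (role, content) tuples standing in for self._canvas.history.
import pandas as pd

def last_user_turn(history, output_var_name="content"):
    # Scan the conversation newest-first and surface the latest user message.
    for role, content in reversed(history):
        if role == "user":
            return output_var_name, pd.DataFrame([{"content": content}])
    # No user turn yet: return an empty frame (the committed code builds
    # this pair but appears to drop the `return`).
    return output_var_name, pd.DataFrame([])

history = [("user", "hi"), ("assistant", "hello"), ("user", "thanks")]
name, frame = last_user_turn(history)
print(frame.iloc[0]["content"])  # -> "thanks"
```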
api/apps/canvas_app.py
CHANGED

@@ -146,12 +146,16 @@ def run():
 
         canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
         canvas.history.append(("assistant", final_ans["content"]))
+        if not canvas.path[-1]:
+            canvas.path.pop(-1)
         if final_ans.get("reference"):
             canvas.reference.append(final_ans["reference"])
         cvs.dsl = json.loads(str(canvas))
         UserCanvasService.update_by_id(req["id"], cvs.to_dict())
     except Exception as e:
         cvs.dsl = json.loads(str(canvas))
+        if not canvas.path[-1]:
+            canvas.path.pop(-1)
         UserCanvasService.update_by_id(req["id"], cvs.to_dict())
         traceback.print_exc()
         yield "data:" + json.dumps({"code": 500, "message": str(e),
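Both the success and exception branches now drop an empty trailing entry from `canvas.path` before the DSL is serialized and saved. A minimal sketch of the guard, assuming `path` is a list of per-step lists of component ids (an assumption; only the emptiness check comes from the diff):

```python
# Sketch of the added guard: an empty final step is popped so it is not
# persisted with the canvas DSL. The list-of-lists shape is assumed.
def drop_empty_tail(path):
    if not path[-1]:
        path.pop(-1)
    return path

print(drop_empty_tail([["begin"], ["retrieval:0"], []]))
# -> [['begin'], ['retrieval:0']]
```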
api/apps/dialog_app.py
CHANGED

@@ -103,10 +103,7 @@ def set_dialog():
             }
             if not DialogService.save(**dia):
                 return get_data_error_result(message="Fail to new a dialog!")
-
-            if not e:
-                return get_data_error_result(message="Fail to new a dialog!")
-            return get_json_result(data=dia.to_json())
+            return get_json_result(data=dia)
         else:
             del req["dialog_id"]
             if "kb_names" in req:
@@ -117,6 +114,7 @@ def set_dialog():
         if not e:
             return get_data_error_result(message="Fail to update a dialog!")
         dia = dia.to_dict()
+        dia.update(req)
         dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
         return get_json_result(data=dia)
     except Exception as e:
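The refactor removes a dead re-check after `DialogService.save` on the create path and, on the update path, overlays the stored dialog with the request payload before returning it. A small sketch of the merge semantics of the added `dia.update(req)` line:

```python
# Sketch of the response merge on the update path: request fields override
# stored ones on key collisions, per dict.update semantics.
stored = {"dialog_id": "d1", "name": "old name", "kb_ids": ["kb1"]}
req = {"name": "new name"}

dia = dict(stored)  # stands in for dia.to_dict()
dia.update(req)     # the added line
print(dia["name"])  # -> "new name"
```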
api/apps/kb_app.py
CHANGED

@@ -185,7 +185,8 @@ def rm():
                 return get_data_error_result(
                     message="Database error (Document removal)!")
             f2d = File2DocumentService.get_by_document_id(doc.id)
-
+            if f2d:
+                FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
             File2DocumentService.delete_by_document_id(doc.id)
         FileService.filter_delete(
             [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
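The knowledge-base removal path now deletes a document's linked file record only when the `File2Document` lookup actually returns rows, so `f2d[0]` can no longer raise on an empty result. A sketch with plain dicts standing in for the ORM rows:

```python
# Sketch of the added guard: index the lookup result only after confirming
# it is non-empty. Dicts stand in for the real File2Document rows.
def files_to_delete(f2d_rows):
    if f2d_rows:                         # guard added by the commit
        return [f2d_rows[0]["file_id"]]  # the linked file to remove
    return []                            # no mapping: nothing extra to delete

print(files_to_delete([{"file_id": "f42"}]))  # -> ['f42']
print(files_to_delete([]))                    # -> []
```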
api/utils/api_utils.py
CHANGED

@@ -120,6 +120,10 @@ def server_error_response(e):
     if len(e.args) > 1:
         return get_json_result(
             code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+    if repr(e).find("index_not_found_exception") >= 0:
+        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                               message="No chunk found, please upload file and parse it.")
+
     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
 
 
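`server_error_response` now intercepts Elasticsearch-style `index_not_found_exception` errors and rewrites them into a user-facing hint instead of echoing the raw exception. A sketch of the substring-based mapping:

```python
# Sketch of the substring check: repr() of the exception is searched for
# the engine's error token and mapped to a friendlier message.
def friendly_message(e: Exception) -> str:
    if repr(e).find("index_not_found_exception") >= 0:
        return "No chunk found, please upload file and parse it."
    return repr(e)

err = RuntimeError("index_not_found_exception: no such index [ragflow_x]")
print(friendly_message(err))  # -> "No chunk found, please upload file and parse it."
```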
graphrag/graph_prompt.py
CHANGED

@@ -11,20 +11,20 @@ Given a text document that is potentially relevant to this activity and a list o
 
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
+- entity_name: Name of the entity, capitalized, in language of 'Text'
 - entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
+- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
 
 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
 - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
 
-3. Return output
+3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
 4. When finished, output {completion_delimiter}
 
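The prompt change pins entity names and descriptions to the language of the input 'Text' and spells out the single-list output joined by `{record_delimiter}`. For illustration, a sketch of records assembled per the template, with placeholder delimiter values (`tuple_delimiter` and `record_delimiter` are prompt variables; the `"<|>"` and `"##"` strings below are assumptions, not values taken from the repo):

```python
# Sketch of the record format the prompt asks the LLM to emit, using
# assumed placeholder delimiters.
tuple_delimiter, record_delimiter = "<|>", "##"
entity = (f'("entity"{tuple_delimiter}RAGFLOW{tuple_delimiter}organization'
          f'{tuple_delimiter}Open-source RAG engine)')
rel = (f'("relationship"{tuple_delimiter}RAGFLOW{tuple_delimiter}GRAPHRAG'
       f'{tuple_delimiter}uses it for knowledge-graph extraction{tuple_delimiter}8)')
print(record_delimiter.join([entity, rel]))
```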
graphrag/utils.py
CHANGED

@@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
     return bin
 
 
-def set_llm_cache(llmnm, txt, v):
+def set_llm_cache(llmnm, txt, v, history, genconf):
     hasher = xxhash.xxh64()
     hasher.update(str(llmnm).encode("utf-8"))
     hasher.update(str(txt).encode("utf-8"))
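`set_llm_cache` now takes the same `history` and `genconf` arguments as `get_llm_cache`, presumably so reads and writes can derive the same hash key. A sketch of such a key derivation with `xxhash` (which fields `set_llm_cache` hashes beyond `llmnm`/`txt` is an assumption based on the matching signatures):

```python
# Sketch of a cache key derived the way both functions now can: model name,
# prompt, history and generation config all feed one xxh64 digest, so
# get/set address the same cache entry.
import xxhash

def cache_key(llmnm, txt, history, genconf) -> str:
    hasher = xxhash.xxh64()
    for part in (llmnm, txt, history, genconf):
        hasher.update(str(part).encode("utf-8"))
    return hasher.hexdigest()

print(cache_key("gpt-4o", "hello", [("user", "hi")], {"temperature": 0.1}))
```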
rag/app/laws.py
CHANGED

@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-
-
-
-        chunks = sections
-        return tokenize_chunks(chunks, doc, eng, pdf_parser)
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(
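The `.docx` branch now parses directly via `Docx()(filename, binary)`, reports progress, and tokenizes without a PDF parser. A self-contained sketch of the reshaped control flow (`parse_docx` and `tokenize_chunks` below are stand-ins for the repo's `Docx` callable and `tokenize_chunks` helper):

```python
# Sketch of the new docx branch: parse, report progress, tokenize.
def parse_docx(filename, binary=None):             # stand-in for Docx()(...)
    return ["chunk one", "chunk two"]

def tokenize_chunks(chunks, doc, eng, pdf_parser):  # stand-in helper
    return [{"doc": doc, "text": c} for c in chunks]

def chunk_docx(filename, binary, doc, eng, callback):
    callback(0.1, "Start to parse.")
    chunks = parse_docx(filename, binary)
    callback(0.7, "Finish parsing.")
    return tokenize_chunks(chunks, doc, eng, None)  # no pdf_parser for docx

res = chunk_docx("a.docx", None, {"docnm_kwd": "a.docx"}, True, lambda *a: None)
print(len(res))  # -> 2
```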
rag/app/manual.py
CHANGED

@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
-        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
             max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
             most_level = max(0, max_lvl - 1)
             levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         return res
 
-
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
         docx_parser = Docx()
         ti_list, tbls = docx_parser(filename, binary,
                                     from_page=0, to_page=10000, callback=callback)
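The first hunk lowers the outline-density cutoff that decides whether PDF bookmarks are trusted for title pivots, so sparser outlines still qualify; the second makes the docx path an explicit `elif`. A sketch of the density test:

```python
# Sketch of the outline-density test: PDF bookmarks drive title levels only
# when they cover enough of the parsed sections.
def use_outlines(num_outlines: int, num_sections: int, cutoff: float = 0.03) -> bool:
    return num_sections > 0 and num_outlines / num_sections > cutoff

print(use_outlines(5, 100))  # True: 5% coverage clears the 3% bar
print(use_outlines(2, 100))  # False: 2% does not
```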
rag/app/table.py
CHANGED

@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
               "datetime": "_dt",
               "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "index", "idx"]:
+        for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
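The table chunker now also drops `_id` (a column commonly present in data exported from MongoDB or Elasticsearch) along with the other housekeeping columns. A runnable sketch of the scrub:

```python
# Sketch of the column scrub: housekeeping columns, now including "_id",
# are removed from each DataFrame before the table is chunked.
import pandas as pd

df = pd.DataFrame({"_id": [1, 2], "name": ["a", "b"], "idx": [0, 1]})
for n in ["id", "_id", "index", "idx"]:
    if n in df.columns:
        del df[n]
print(df.columns.tolist())  # -> ['name']
```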