KevinHuSh
committed
Commit · 0442b90
1 Parent(s): 369d05c

upgrade laws parser of docx (#1332)

### What problem does this PR solve?
### Type of change
- [x] Refactoring
- api/apps/chunk_app.py +5 -1
- api/db/services/dialog_service.py +3 -1
- rag/app/laws.py +27 -45
- rag/nlp/__init__.py +21 -6
api/apps/chunk_app.py CHANGED

```diff
@@ -20,7 +20,7 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, rag_tokenizer
+from rag.nlp import search, rag_tokenizer, keyword_extraction
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils import rmSpace
 from api.db import LLMType, ParserType
@@ -268,6 +268,10 @@ def retrieval_test():
         rerank_mdl = TenantLLMService.model_instance(
             kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])
 
+    if req.get("keyword", False):
+        chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
+        question += keyword_extraction(chat_mdl, question)
+
     ranks = retrievaler.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size,
                                   similarity_threshold, vector_similarity_weight, top,
                                   doc_ids, rerank_mdl=rerank_mdl)
```
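For anyone trying the new flag against a running instance, here is a hedged usage sketch. The route and port follow RAGFlow's usual `/v1/<app>` registration and default API port, but the auth header and all payload values other than `keyword` are illustrative assumptions, not taken from this diff:

```python
# Hypothetical call to the retrieval-test endpoint with the new flag enabled.
# Only the "keyword" field is introduced by this PR: when true, the server asks
# a chat model for keywords and appends them to the question before retrieval.
import requests

resp = requests.post(
    "http://localhost:9380/v1/chunk/retrieval_test",  # assumed route for retrieval_test()
    headers={"Authorization": "Bearer <token>"},      # placeholder auth
    json={
        "kb_id": "<knowledge-base-id>",
        "question": "What is the limitation period for contract disputes?",
        "keyword": True,  # new in this PR
        "page": 1,
        "size": 30,
    },
)
print(resp.json())
```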
api/db/services/dialog_service.py CHANGED

```diff
@@ -23,7 +23,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
 from api.settings import chat_logger, retrievaler
 from rag.app.resume import forbidden_select_fields4resume
-from rag.nlp
+from rag.nlp import keyword_extraction
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder
 
@@ -121,6 +121,8 @@ def chat(dialog, messages, stream=True, **kwargs):
     if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
         kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
     else:
+        if prompt_config.get("keyword", False):
+            questions[-1] += keyword_extraction(chat_mdl, questions[-1])
         kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                         dialog.similarity_threshold,
                                         dialog.vector_similarity_weight,
```
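The chat path mirrors the retrieval-test change: when the dialog's `prompt_config` has `keyword` set, the last user question is extended with LLM-extracted keywords before retrieval. A minimal, self-contained sketch with a stub model (the stub class, its canned reply, and the sample question are invented; `keyword_extraction` is abridged from the helper this PR adds to rag/nlp/__init__.py below):

```python
# Self-contained sketch of the new chat-side behavior, using a stub in place
# of the tenant chat model. Only the control flow mirrors the diff.

class StubChatModel:
    def chat(self, system, history, gen_conf):
        # The real model would answer the keyword-extraction prompt; we fake it.
        return " - keyword: statute of limitations", 8  # (answer, token_count)

def keyword_extraction(chat_mdl, content):
    # Abridged from the helper added in rag/nlp/__init__.py in this PR.
    prompt = "You're a question analyzer. Give the most important keyword/phrase."
    kwd, _ = chat_mdl.chat(prompt, [{"role": "user", "content": content}], {"temperature": 0.2})
    return kwd

prompt_config = {"keyword": True}
questions = ["How long can I wait before suing over a contract?"]
if prompt_config.get("keyword", False):
    questions[-1] += keyword_extraction(StubChatModel(), questions[-1])
print(questions[-1])  # the question now carries the extracted keyword for retrieval
```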
rag/app/laws.py CHANGED

```diff
@@ -54,62 +54,44 @@ class Docx(DocxParser):
         self.doc = Document(
             filename) if not binary else Document(BytesIO(binary))
         pn = 0
-        last_question, last_answer, last_level = "", "", -1
         lines = []
-        root = DocxNode()
-        point = root
         bull = bullets_category([p.text for p in self.doc.paragraphs])
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            question_level, p_text = 0, ''
-            if from_page <= pn < to_page and p.text.strip():
-                question_level, p_text = docx_question_level(p, bull)
-            if not question_level or question_level > 6: # not a question
-                last_answer = f'{last_answer}\n{p_text}'
-            else:   # is a question
-                if last_question:
-                    while last_level <= point.level:
-                        point = point.parent
-                    new_node = DocxNode(last_question, last_answer, last_level, [], point)
-                    point.childs.append(new_node)
-                    point = new_node
-                    last_question, last_answer, last_level = '', '', -1
-                last_level = question_level
-                last_answer = ''
-                last_question = p_text
-
+            question_level, p_text = docx_question_level(p, bull)
+            if not p_text.strip("\n"):continue
+            lines.append((question_level, p_text))
+
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
                     continue
                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
-        ...
-            self.childs = childs
-            self.parent = parent
+
+        visit = [False for _ in range(len(lines))]
+        sections = []
+        for s in range(len(lines)):
+            e = s + 1
+            while e < len(lines):
+                if lines[e][0] <= lines[s][0]:
+                    break
+                e += 1
+            if e - s == 1 and visit[s]: continue
+            sec = []
+            next_level = lines[s][0] + 1
+            while not sec and next_level < 22:
+                for i in range(s+1, e):
+                    if lines[i][0] != next_level: continue
+                    sec.append(lines[i][1])
+                    visit[i] = True
+                next_level += 1
+            sec.insert(0, lines[s][1])
+
+            sections.append("\n".join(sec))
+        return [l for l in sections if l]
+
     def __str__(self) -> str:
         return f'''
         question:{self.question},
```
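To see what the rewritten `__call__` produces, here is its new section-building loop run standalone on an invented list of `(level, text)` pairs (the chapter/article strings are made up). Each heading collects the entries at the nearest populated level below it, and an entry that was already absorbed into its parent is skipped when it would otherwise form a one-line section:

```python
# The new section builder from Docx.__call__, run on invented data.
# lines holds (heading_level, text) pairs in document order; smaller numbers
# are higher-level headings.
lines = [
    (1, "Chapter 1 General Provisions"),
    (2, "Article 1 Purpose ..."),
    (2, "Article 2 Scope ..."),
    (1, "Chapter 2 Formation of Contracts"),
    (2, "Article 3 Offer and acceptance ..."),
]

visit = [False for _ in range(len(lines))]
sections = []
for s in range(len(lines)):
    # e marks the end of s's subtree: the next entry at the same or higher level
    e = s + 1
    while e < len(lines):
        if lines[e][0] <= lines[s][0]:
            break
        e += 1
    # a childless entry already pulled into its parent would be a duplicate; skip it
    if e - s == 1 and visit[s]: continue
    sec = []
    next_level = lines[s][0] + 1
    # descend until some sub-level under s is populated (levels are capped at 22)
    while not sec and next_level < 22:
        for i in range(s + 1, e):
            if lines[i][0] != next_level: continue
            sec.append(lines[i][1])
            visit[i] = True
        next_level += 1
    sec.insert(0, lines[s][1])
    sections.append("\n".join(sec))

print([l for l in sections if l])
# ['Chapter 1 General Provisions\nArticle 1 Purpose ...\nArticle 2 Scope ...',
#  'Chapter 2 Formation of Contracts\nArticle 3 Offer and acceptance ...']
```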
rag/nlp/__init__.py CHANGED

```diff
@@ -514,16 +514,19 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
 
     return cks
 
+
 def docx_question_level(p, bull = -1):
+    txt = re.sub(r"\u3000", " ", p.text).strip()
     if p.style.name.startswith('Heading'):
-        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
+        return int(p.style.name.split(' ')[-1]), txt
     else:
         if bull < 0:
-            return 0, re.sub(r"\u3000", " ", p.text).strip()
+            return 0, txt
         for j, title in enumerate(BULLET_PATTERN[bull]):
-            if re.match(title, re.sub(r"\u3000", " ", p.text).strip()):
-                return j+1, re.sub(r"\u3000", " ", p.text).strip()
-        return len(BULLET_PATTERN[bull]), re.sub(r"\u3000", " ", p.text).strip()
+            if re.match(title, txt):
+                return j+1, txt
+        return len(BULLET_PATTERN[bull]), txt
+
 
 def concat_img(img1, img2):
     if img1 and not img2:
@@ -544,6 +547,7 @@ def concat_img(img1, img2):
 
     return new_image
 
+
 def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
     if not sections:
         return []
@@ -573,4 +577,15 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
     for sec, image in sections:
        add_chunk(sec, image, '')
 
-    return cks, images
+    return cks, images
+
+
+def keyword_extraction(chat_mdl, content):
+    prompt = """
+You're a question analyzer.
+1. Please give me the most important keyword/phrase of this question.
+Answer format: (in language of user's question)
+ - keyword:
+"""
+    kwd, _ = chat_mdl.chat(prompt, [{"role": "user", "content": content}], {"temperature": 0.2})
+    return kwd
```
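For a sense of what `docx_question_level` returns after this change, a standalone sketch with stubbed paragraph objects. The bullet regexes are invented stand-ins for `BULLET_PATTERN[bull]`, and only `style.name` and `text` are stubbed, since those are the only attributes the function reads; the logic mirrors the updated helper (minus the `else` nesting, which does not affect behavior):

```python
import re
from types import SimpleNamespace

# Invented stand-in for rag.nlp.BULLET_PATTERN[0]: chapter marker, then article marker.
BULLET_PATTERN = [[r"第[零一二三四五六七八九十百0-9]+章", r"第[零一二三四五六七八九十百0-9]+条"]]

def docx_question_level(p, bull=-1):
    # Normalize full-width spaces once, then derive a level either from the
    # Word heading style or from the bullet patterns.
    txt = re.sub(r"\u3000", " ", p.text).strip()
    if p.style.name.startswith('Heading'):
        return int(p.style.name.split(' ')[-1]), txt
    if bull < 0:
        return 0, txt
    for j, title in enumerate(BULLET_PATTERN[bull]):
        if re.match(title, txt):
            return j + 1, txt
    return len(BULLET_PATTERN[bull]), txt

chapter = SimpleNamespace(style=SimpleNamespace(name="Normal"), text="第一章\u3000总则")
heading = SimpleNamespace(style=SimpleNamespace(name="Heading 2"), text="Scope")
body = SimpleNamespace(style=SimpleNamespace(name="Normal"), text="Ordinary body text")

print(docx_question_level(chapter, bull=0))  # (1, '第一章 总则'): matched the chapter pattern
print(docx_question_level(heading))          # (2, 'Scope'): level taken from the heading style
print(docx_question_level(body, bull=0))     # (2, 'Ordinary body text'): fell past all patterns
```

Paired with the grouping loop in laws.py above, these levels are what decide which paragraphs become section headings and which get pulled in beneath them.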