add support for eml file parser (#1768)
### What problem does this PR solve?
Add support for parsing EML (email) files: file-type detection, a dedicated `email` chunker, and registration in the parser lists and web UI.

#1363
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Zhedong Cen <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>
- api/apps/dataset_api.py +3 -1
- api/db/__init__.py +1 -0
- api/db/init_data.py +1 -1
- api/settings.py +1 -1
- api/utils/file_utils.py +1 -1
- deepdoc/parser/__init__.py +2 -1
- deepdoc/parser/html_parser.py +6 -1
- deepdoc/parser/txt_parser.py +42 -0
- rag/app/email.py +114 -0
- rag/app/naive.py +3 -20
- rag/svr/task_executor.py +2 -1
- web/src/components/chunk-method-modal/hooks.ts +2 -1
api/apps/dataset_api.py (CHANGED)

```diff
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
             audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
```
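With the new `case "email"` arm, an uploaded `.eml` document is routed to `rag.app.email.chunk`. A minimal sketch of exercising that entry point directly, with a throwaway progress callback standing in for the API's `partial(doc_parse_callback, doc_id)` (the file name is hypothetical):

```python
from rag.app import email

def progress(prog=None, msg=""):
    print(prog, msg)

# sample.eml is any RFC 822 message saved to disk (hypothetical path).
with open("sample.eml", "rb") as f:
    chunks = email.chunk("sample.eml", binary=f.read(), callback=progress)
```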
api/db/__init__.py (CHANGED)

```diff
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
 
 
```
api/db/init_data.py (CHANGED)

```diff
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
```
api/settings.py (CHANGED)

```diff
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
```
api/utils/file_utils.py (CHANGED)

```diff
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
 
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
        return FileType.DOC.value
 
     if re.match(
```
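With `eml` added to the extension pattern, `filename_type` now classifies email uploads as document-type files. A trimmed, self-contained restatement of just the updated check:

```python
import re

# Same regex as the patched filename_type check in api/utils/file_utils.py.
DOC_PATTERN = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

def looks_like_doc(filename: str) -> bool:
    return bool(re.match(DOC_PATTERN, filename))

print(looks_like_doc("message.eml"))  # True
```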
deepdoc/parser/__init__.py (CHANGED)

```diff
@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
```
deepdoc/parser/html_parser.py (CHANGED)

```diff
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)
 
+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
```
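Extracting the readability logic into a `parser_txt` classmethod lets callers run HTML-to-text extraction on an in-memory string without a file on disk; the new email chunker relies on exactly this for `text/html` parts. A small sketch (the exact output depends on how readability normalizes the markup):

```python
from deepdoc.parser import HtmlParser

html = "<html><head><title>Release notes</title></head><body><p>EML parsing added.</p></body></html>"
sections = HtmlParser.parser_txt(html)
# parser_txt joins the extracted <title> and readable body text with "\n",
# then splits on newlines, so sections is roughly:
#   ["Release notes", "EML parsing added."]
```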
deepdoc/parser/txt_parser.py (ADDED)

```diff
@@ -0,0 +1,42 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rag.nlp import find_codec,num_tokens_from_string
+
+class RAGFlowTxtParser:
+    def __call__(self, fnm, binary=None, chunk_token_num=128):
+        txt = ""
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(fnm, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l:
+                        break
+                    txt += l
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
+        sections = []
+        for sec in txt.split("\n"):
+            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
+                sections.append((sec[: int(len(sec) / 2)], ""))
+                sections.append((sec[int(len(sec) / 2) :], ""))
+            else:
+                sections.append((sec, ""))
+        return sections
```
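`RAGFlowTxtParser` returns `(text, "")` section tuples and guards against pathological one-line files: any line longer than roughly ten times `chunk_token_num` tokens is split in half. A minimal sketch of the bytes entry point:

```python
from deepdoc.parser import TxtParser

parser = TxtParser()
sections = parser(None, binary=b"short line\n" + b"x" * 20000)
# "short line" stays whole; the 20000-char line exceeds 10 * 128 tokens,
# so it comes back as two half-length sections:
#   [("short line", ""), ("xxx...", ""), ("xxx...", "")]
```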
rag/app/email.py (ADDED)

```diff
@@ -0,0 +1,114 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+from rag.settings import cron_logger
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    # get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。;!?"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
```
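Beyond the module's own `__main__` smoke test, the chunker can be fed an in-memory message. A sketch using the standard library to build one (this assumes a fully configured RAGFlow runtime, since `chunk` pulls in `rag_tokenizer`, `naive_merge`, and `cron_logger`):

```python
from email.message import EmailMessage
from rag.app import email as email_app

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["To"] = "bob@example.com"
msg["Subject"] = "Hello"
msg.set_content("Plain-text body.")
msg.add_alternative("<p>HTML body.</p>", subtype="html")

# Headers and the text/plain part go through TxtParser, the text/html part
# through HtmlParser, and both streams are merged into tokenized chunks.
chunks = email_app.chunk("hello.eml", binary=bytes(msg),
                         callback=lambda prog=None, msg="": None)
```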
rag/app/naive.py (CHANGED)

```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls
 
 
+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
-
+        sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
```
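The deleted inline block and the new `TxtParser()` call are meant to be behaviorally identical, since the class is a near-verbatim extraction of that code. A quick check under that assumption (and assuming `find_codec` detects the bytes as UTF-8):

```python
from deepdoc.parser import TxtParser

raw = b"alpha\nbeta"
sections = TxtParser()(None, binary=raw, chunk_token_num=128)
print(sections)  # [('alpha', ''), ('beta', '')], the same output the removed block produced
```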
rag/svr/task_executor.py (CHANGED)

```diff
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
 
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
 
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
 
```
web/src/components/chunk-method-modal/hooks.ts (CHANGED)

```diff
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph'
+      'knowledge_graph',
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);
 
 const getParserList = (
```