add support for eml file parser (#1768)
### What problem does this PR solve?

Add support for parsing .eml (email) files.

#1363

### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Zhedong Cen <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>
- api/apps/dataset_api.py +3 -1
- api/db/__init__.py +1 -0
- api/db/init_data.py +1 -1
- api/settings.py +1 -1
- api/utils/file_utils.py +1 -1
- deepdoc/parser/__init__.py +2 -1
- deepdoc/parser/html_parser.py +6 -1
- deepdoc/parser/txt_parser.py +42 -0
- rag/app/email.py +114 -0
- rag/app/naive.py +3 -20
- rag/svr/task_executor.py +2 -1
- web/src/components/chunk-method-modal/hooks.ts +2 -1
api/apps/dataset_api.py (CHANGED)

```diff
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
             audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
```

api/db/__init__.py (CHANGED)

```diff
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
 
 
```

api/db/init_data.py (CHANGED)

```diff
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
```

api/settings.py (CHANGED)

```diff
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
```

api/utils/file_utils.py (CHANGED)

```diff
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
 
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value
 
     if re.match(
```

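A quick sanity check of the widened suffix pattern (a minimal sketch; the pattern string is copied from the diff above):

```python
import re

# Pattern from filename_type() after this change.
DOC_PATTERN = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

assert re.match(DOC_PATTERN, "invoice.eml")    # .eml is now classified as a DOC-type file
assert not re.match(DOC_PATTERN, "photo.jpg")  # images are handled by a later branch
```
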
deepdoc/parser/__init__.py (CHANGED)

```diff
@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
 from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
```

deepdoc/parser/html_parser.py (CHANGED)

```diff
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)
 
+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
         txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
```

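Exposing the extraction logic as a `parser_txt` classmethod lets callers, including the new email parser below, section a raw HTML string without touching the filesystem. A minimal sketch (assumes the `readability` and `html_text` dependencies this module imports are installed):

```python
from deepdoc.parser import HtmlParser

html = "<html><head><title>Hello</title></head><body><p>First line.</p></body></html>"
# Returns the extracted title plus body text split on newlines,
# e.g. roughly ["Hello", "First line."].
sections = HtmlParser.parser_txt(html)
```
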
deepdoc/parser/txt_parser.py (ADDED)

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from rag.nlp import find_codec, num_tokens_from_string


class RAGFlowTxtParser:
    def __call__(self, fnm, binary=None, chunk_token_num=128):
        txt = ""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128):
        if type(txt) != str:
            raise TypeError("txt type should be str!")
        sections = []
        for sec in txt.split("\n"):
            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
                sections.append((sec[: int(len(sec) / 2)], ""))
                sections.append((sec[int(len(sec) / 2):], ""))
            else:
                sections.append((sec, ""))
        return sections
```

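For reference, a minimal usage sketch of the new parser (`notes.txt` is a hypothetical sample file). Each section comes back as a `(text, "")` tuple, matching the shape `naive_merge` expects; lines longer than ten times `chunk_token_num` are split in half:

```python
from deepdoc.parser import TxtParser

parser = TxtParser()
# Parse from raw bytes; find_codec() guesses the encoding.
with open("notes.txt", "rb") as f:
    sections = parser("notes.txt", binary=f.read(), chunk_token_num=128)

for text, _ in sections[:5]:
    print(text)
```
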
rag/app/email.py (ADDED)

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from rag.settings import cron_logger
import io


def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Only eml is supported
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    if binary:
        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
    else:
        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    # get the email main info
    def _add_content(msg, content_type):
        if content_type == "text/plain":
            text_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif content_type == "text/html":
            html_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif "multipart" in content_type:
            if msg.is_multipart():
                for part in msg.iter_parts():
                    _add_content(part, part.get_content_type())

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
    # get the attachment info
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if content_disposition:
            dispositions = content_disposition.strip().split(";")
            if dispositions[0].lower() == "attachment":
                filename = part.get_filename()
                payload = part.get_payload(decode=True)
                try:
                    attachment_res.extend(
                        naive_chunk(filename, payload, callback=callback, **kwargs)
                    )
                except Exception:
                    pass

    return main_res + attachment_res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)
```

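The parser builds on the standard library's `email` package with `policy.default`, which yields `EmailMessage` objects. A standalone sketch of the same stdlib pattern, independent of RAGFlow (`message.eml` is a hypothetical sample file):

```python
from email import policy
from email.parser import BytesParser

with open("message.eml", "rb") as f:
    msg = BytesParser(policy=policy.default).parse(f)

print("Subject:", msg["Subject"])
body = msg.get_body(preferencelist=("plain",))  # prefer the text/plain part
if body is not None:
    print(body.get_content())
for part in msg.iter_attachments():
    print("attachment:", part.get_filename())
```
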
rag/app/naive.py (CHANGED)

```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls
 
 
+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
-
+        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
```

rag/svr/task_executor.py (CHANGED)

```diff
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
 
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
 
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
```

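The `FACTORY` map is what routes a queued document to its chunker, so registering `ParserType.EMAIL` here is what activates the new module. A hedged sketch of the dispatch (the `task` fields and surrounding names are illustrative assumptions, not shown in this diff):

```python
# Hypothetical dispatch sketch; `task`, `file_blob`, and `progress_callback`
# are illustrative names, not part of this diff.
chunker = FACTORY.get(task["parser_id"].lower(), naive)  # fall back to the general parser
chunks = chunker.chunk(task["name"], binary=file_blob, callback=progress_callback)
```
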
web/src/components/chunk-method-modal/hooks.ts (CHANGED)

```diff
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph'
+      'knowledge_graph',
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
     ],
     [['md'], ['naive', 'qa', 'knowledge_graph']],
     [['json'], ['naive', 'knowledge_graph']],
+    [['eml'], ['email']]
   ]);
 
 const getParserList = (
```
|