黄腾 (aopstudio) and Kevin Hu committed
Commit 6ed07a9 · Parent: 29fdf3e

add support for eml file parser (#1768)


### What problem does this PR solve?

Add support for an EML (email) file parser (#1363).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>
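
The parser is built on the standard library's `email` package rather than a new dependency. As a minimal sketch of that plumbing — with an invented message — this shows how headers and the plain-text body are pulled out the same way `rag/app/email.py` does below:

```python
# Stdlib-only sketch of the EML handling the new parser relies on.
# The message bytes are invented for illustration.
import io
from email import policy
from email.parser import BytesParser

raw = (b"From: alice@example.com\r\n"
       b"To: bob@example.com\r\n"
       b"Subject: Hello\r\n"
       b"Content-Type: text/plain; charset=utf-8\r\n"
       b"\r\n"
       b"Just a quick note.\r\n")

msg = BytesParser(policy=policy.default).parse(io.BytesIO(raw))
for header, value in msg.items():           # header lines become text sections
    print(f"{header}: {value}")
if msg.get_content_type() == "text/plain":  # body is decoded with its own charset
    print(msg.get_payload(decode=True).decode(msg.get_content_charset()))
```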

api/apps/dataset_api.py CHANGED

```diff
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
             audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
```
api/db/__init__.py CHANGED

```diff
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
```
api/db/init_data.py CHANGED

```diff
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
```
api/settings.py CHANGED

```diff
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
```
api/utils/file_utils.py CHANGED

```diff
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value

     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value

     if re.match(
```
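
The only change here is adding `eml` to the extension group that maps to `FileType.DOC`. A quick standalone check of the pattern itself (not the full `filename_type` helper):

```python
import re

# The widened extension pattern, with eml added to the DOC group.
DOC_PATTERN = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

for name in ("report.eml", "slides.pptx", "archive.zip"):
    print(name, bool(re.match(DOC_PATTERN, name)))
# report.eml True, slides.pptx True, archive.zip False
```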
deepdoc/parser/__init__.py CHANGED

```diff
@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
```
deepdoc/parser/html_parser.py CHANGED

```diff
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)

+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
```
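
Hoisting the extraction into a `parser_txt` classmethod lets callers (such as the email parser below) section an HTML string without a file on disk. A small usage sketch, assuming a RAGFlow checkout with `readability-lxml` and `html-text` installed; the HTML is invented:

```python
from deepdoc.parser import HtmlParser

html = """<html><head><title>Release Notes</title></head>
<body><p>EML parsing is now supported.</p></body></html>"""

# parser_txt is a classmethod, so no instance or file handle is needed.
sections = HtmlParser.parser_txt(html)
print(sections)  # e.g. ['Release Notes', 'EML parsing is now supported.']
```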
deepdoc/parser/txt_parser.py ADDED

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from rag.nlp import find_codec, num_tokens_from_string


class RAGFlowTxtParser:
    def __call__(self, fnm, binary=None, chunk_token_num=128):
        txt = ""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128):
        if type(txt) != str:
            raise TypeError("txt type should be str!")
        sections = []
        for sec in txt.split("\n"):
            # A line longer than 10x the token budget is split in half so
            # downstream merging never sees a single oversized section.
            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
                sections.append((sec[: int(len(sec) / 2)], ""))
                sections.append((sec[int(len(sec) / 2):], ""))
            else:
                sections.append((sec, ""))
        return sections
```
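
A behavior sketch for the new parser, assuming the `rag.nlp` imports resolve in a RAGFlow environment; the input text is invented. Ordinary lines pass through as `(text, "")` tuples, while an oversized line is halved:

```python
from deepdoc.parser import TxtParser

text = "short line\n" + "long line " * 1000  # second line far exceeds 10 * 128 tokens

sections = TxtParser.parser_txt(text, chunk_token_num=128)
# Expected: [("short line", ""), (<first half of long line>, ""), (<second half>, "")]
print(len(sections))
```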
rag/app/email.py ADDED

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from rag.settings import cron_logger
import io


def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Only eml is supported
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    if binary:
        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
    else:
        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    # get the email main info
    def _add_content(msg, content_type):
        if content_type == "text/plain":
            text_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif content_type == "text/html":
            html_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif "multipart" in content_type:
            # recurse into multipart containers to collect every body part
            if msg.is_multipart():
                for part in msg.iter_parts():
                    _add_content(part, part.get_content_type())

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
    # get the attachment info
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if content_disposition:
            dispositions = content_disposition.strip().split(";")
            if dispositions[0].lower() == "attachment":
                filename = part.get_filename()
                payload = part.get_payload(decode=True)
                try:
                    attachment_res.extend(
                        naive_chunk(filename, payload, callback=callback, **kwargs)
                    )
                except Exception:
                    # attachments the naive parser cannot handle are skipped
                    pass

    return main_res + attachment_res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)
```
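
An end-to-end usage sketch, assuming a configured RAGFlow environment (the module pulls in `rag.settings` and the tokenizer on import); the message is invented and passed as raw bytes via `binary`:

```python
from email.message import EmailMessage
from rag.app import email as email_app

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["To"] = "bob@example.com"
msg["Subject"] = "Quarterly report"
msg.set_content("Please find the figures attached.")

def progress(prog=None, msg=""):
    print(prog, msg)

# Headers and body become the main chunks; any attachments are routed
# through rag.app.naive.chunk.
chunks = email_app.chunk("report.eml", binary=msg.as_bytes(), callback=progress)
print(len(chunks))
```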
rag/app/naive.py CHANGED

```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls


+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
-
+        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
```
rag/svr/task_executor.py CHANGED

```diff
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
```
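
Routing is a plain dict lookup keyed by the document's parser type; a schematic sketch of how an "email" document reaches the new module (importing the executor module assumes a full service environment):

```python
from api.db import ParserType
from rag.svr.task_executor import FACTORY

chunker = FACTORY[ParserType.EMAIL.value]  # -> the rag.app.email module
# The executor then calls chunker.chunk(name, binary=..., callback=...),
# exactly as it does for every other parser type.
```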
web/src/components/chunk-method-modal/hooks.ts CHANGED

```diff
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);

 const getParserList = (
```