黄腾 (aopstudio) and Kevin Hu committed
Commit 6ed07a9 · Parent: 29fdf3e

add support for eml file parser (#1768)


### What problem does this PR solve?

Add support for an EML (email) file parser (#1363).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <[email protected]>
Co-authored-by: Kevin Hu <[email protected]>
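
The parser is built on the standard library's `email` package rather than a new dependency. As a minimal sketch of that plumbing — with an invented message — this shows how headers and the plain-text body are pulled out the same way `rag/app/email.py` does below:

```python
# Stdlib-only sketch of the EML handling the new parser relies on.
# The message bytes are invented for illustration.
import io
from email import policy
from email.parser import BytesParser

raw = (b"From: alice@example.com\r\n"
       b"To: bob@example.com\r\n"
       b"Subject: Hello\r\n"
       b"Content-Type: text/plain; charset=utf-8\r\n"
       b"\r\n"
       b"Just a quick note.\r\n")

msg = BytesParser(policy=policy.default).parse(io.BytesIO(raw))
for header, value in msg.items():           # header lines become text sections
    print(f"{header}: {value}")
if msg.get_content_type() == "text/plain":  # body is decoded with its own charset
    print(msg.get_payload(decode=True).decode(msg.get_content_charset()))
```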

api/apps/dataset_api.py CHANGED

```diff
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
             audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
```
api/db/__init__.py CHANGED

```diff
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
```
api/db/init_data.py CHANGED

```diff
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
```
api/settings.py CHANGED

```diff
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
```
api/utils/file_utils.py CHANGED

```diff
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value

     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value

     if re.match(
```
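
The only change here is adding `eml` to the extension group that maps to `FileType.DOC`. A quick standalone check of the pattern itself (not the full `filename_type` helper):

```python
import re

# The widened extension pattern, with eml added to the DOC group.
DOC_PATTERN = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

for name in ("report.eml", "slides.pptx", "archive.zip"):
    print(name, bool(re.match(DOC_PATTERN, name)))
# report.eml True, slides.pptx True, archive.zip False
```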
deepdoc/parser/__init__.py CHANGED

```diff
@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
```
deepdoc/parser/html_parser.py CHANGED

```diff
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)

+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
```
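
Hoisting the extraction into a `parser_txt` classmethod lets callers (such as the email parser below) section an HTML string without a file on disk. A small usage sketch, assuming a RAGFlow checkout with `readability-lxml` and `html-text` installed; the HTML is invented:

```python
from deepdoc.parser import HtmlParser

html = """<html><head><title>Release Notes</title></head>
<body><p>EML parsing is now supported.</p></body></html>"""

# parser_txt is a classmethod, so no instance or file handle is needed.
sections = HtmlParser.parser_txt(html)
print(sections)  # e.g. ['Release Notes', 'EML parsing is now supported.']
```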
deepdoc/parser/txt_parser.py ADDED

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from rag.nlp import find_codec, num_tokens_from_string


class RAGFlowTxtParser:
    def __call__(self, fnm, binary=None, chunk_token_num=128):
        txt = ""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128):
        if type(txt) != str:
            raise TypeError("txt type should be str!")
        sections = []
        for sec in txt.split("\n"):
            # A line longer than 10x the token budget is split in half so
            # downstream merging never sees a single oversized section.
            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
                sections.append((sec[: int(len(sec) / 2)], ""))
                sections.append((sec[int(len(sec) / 2):], ""))
            else:
                sections.append((sec, ""))
        return sections
```
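
A behavior sketch for the new parser, assuming the `rag.nlp` imports resolve in a RAGFlow environment; the input text is invented. Ordinary lines pass through as `(text, "")` tuples, while an oversized line is halved:

```python
from deepdoc.parser import TxtParser

text = "short line\n" + "long line " * 1000  # second line far exceeds 10 * 128 tokens

sections = TxtParser.parser_txt(text, chunk_token_num=128)
# Expected: [("short line", ""), (<first half of long line>, ""), (<second half>, "")]
print(len(sections))
```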
rag/app/email.py ADDED

```python
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from rag.settings import cron_logger
import io


def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Only eml is supported
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    if binary:
        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
    else:
        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    # get the email main info
    def _add_content(msg, content_type):
        if content_type == "text/plain":
            text_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif content_type == "text/html":
            html_txt.append(
                msg.get_payload(decode=True).decode(msg.get_content_charset())
            )
        elif "multipart" in content_type:
            # recurse into multipart containers to collect every body part
            if msg.is_multipart():
                for part in msg.iter_parts():
                    _add_content(part, part.get_content_type())

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
    # get the attachment info
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if content_disposition:
            dispositions = content_disposition.strip().split(";")
            if dispositions[0].lower() == "attachment":
                filename = part.get_filename()
                payload = part.get_payload(decode=True)
                try:
                    attachment_res.extend(
                        naive_chunk(filename, payload, callback=callback, **kwargs)
                    )
                except Exception:
                    # attachments the naive parser cannot handle are skipped
                    pass

    return main_res + attachment_res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)
```
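
An end-to-end usage sketch, assuming a configured RAGFlow environment (the module pulls in `rag.settings` and the tokenizer on import); the message is invented and passed as raw bytes via `binary`:

```python
from email.message import EmailMessage
from rag.app import email as email_app

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["To"] = "bob@example.com"
msg["Subject"] = "Quarterly report"
msg.set_content("Please find the figures attached.")

def progress(prog=None, msg=""):
    print(prog, msg)

# Headers and body become the main chunks; any attachments are routed
# through rag.app.naive.chunk.
chunks = email_app.chunk("report.eml", binary=msg.as_bytes(), callback=progress)
print(len(chunks))
```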
rag/app/naive.py CHANGED

```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls


+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
-
+        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
```
rag/svr/task_executor.py CHANGED

```diff
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
```
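
Routing is a plain dict lookup keyed by the document's parser type; a schematic sketch of how an "email" document reaches the new module (importing the executor module assumes a full service environment):

```python
from api.db import ParserType
from rag.svr.task_executor import FACTORY

chunker = FACTORY[ParserType.EMAIL.value]  # -> the rag.app.email module
# The executor then calls chunker.chunk(name, binary=..., callback=...),
# exactly as it does for every other parser type.
```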
web/src/components/chunk-method-modal/hooks.ts CHANGED

```diff
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);

 const getParserList = (
```