ragflow / deepdoc /parser /html_parser.py
aopstudio's picture
Add support for HTML file (#973)
858916d
raw
history blame
815 Bytes
# -*- coding: utf-8 -*-
from rag.nlp import find_codec
import readability
import html_text
import chardet
def get_encoding(file):
    """Guess the text encoding of a file with chardet.

    The whole file is read in binary mode and passed to ``chardet.detect``.

    Args:
        file: Path of the file to inspect (anything ``open`` accepts).

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        cannot name one (for example, for an empty file).
    """
    with open(file, "rb") as f:
        raw = f.read()
    # chardet reports an encoding of None when it cannot decide. Passing that
    # on to open(..., encoding=None) would silently select the
    # locale-dependent default, so fall back to an explicit codec instead.
    return chardet.detect(raw)["encoding"] or "utf-8"
class RAGFlowHtmlParser:
    """Callable parser that turns an HTML document into a list of text lines."""

    def __call__(self, fnm, binary=None):
        """Extract the title and main readable text of an HTML document.

        Args:
            fnm: Path of the HTML file; only read when ``binary`` is falsy.
            binary: Optional raw bytes of the document. When non-empty they
                are decoded and used instead of reading ``fnm``.

        Returns:
            A list of strings: the document title followed by the extracted
            content, split on newline characters.
        """
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            # The detected encoding is only a guess, so skip undecodable bytes
            # here as well; this keeps the file branch as tolerant as the
            # in-memory branch above instead of raising UnicodeDecodeError.
            with open(
                fnm, "r", encoding=get_encoding(fnm), errors="ignore"
            ) as f:
                txt = f.read()

        html_doc = readability.Document(txt)
        title = html_doc.title()
        # html_partial=True asks readability for the summary as a fragment
        # rather than a full <html> document before text extraction.
        content = html_text.extract_text(html_doc.summary(html_partial=True))
        return f"{title}\n{content}".split("\n")