Spaces:

retopara
/

ragflow

Build error

ragflow / deepdoc /parser /html_parser.py

Add support for HTML file (#973)

858916d about 1 year ago

815 Bytes

	# -*- coding: utf-8 -*-
	from rag.nlp import find_codec
	import readability
	import html_text
	import chardet

	def get_encoding(file):
	    """Guess the character encoding of the file at path ``file``.

	    The whole file is read as raw bytes and passed to ``chardet.detect``;
	    the ``'encoding'`` entry of the detection result is returned unchanged.
	    NOTE(review): chardet presumably reports ``None`` here when it cannot
	    decide (e.g. empty input) -- confirm against the chardet docs.
	    """
	    with open(file, 'rb') as stream:
	        raw_bytes = stream.read()
	    detection = chardet.detect(raw_bytes)
	    return detection['encoding']

	class RAGFlowHtmlParser:
	    """Callable that turns an HTML document into a list of text lines."""

	    def __call__(self, fnm, binary=None):
	        """Extract the title and readable content of an HTML document.

	        Args:
	            fnm: Path of the HTML file. Only opened when ``binary`` is falsy.
	            binary: Optional raw bytes of the document. When truthy, the
	                bytes are decoded with the codec chosen by ``find_codec``
	                and ``fnm`` is not read.

	        Returns:
	            A list of strings: the text ``"<title>\\n<content>"`` split on
	            newline characters, so the title line(s) come first.
	        """
	        if binary:
	            encoding = find_codec(binary)
	            txt = binary.decode(encoding, errors="ignore")
	        else:
	            # The encoding from get_encoding() is only a guess. Drop
	            # undecodable bytes instead of raising UnicodeDecodeError, so
	            # this path is as tolerant as the ``binary`` path above.
	            with open(fnm, "r", encoding=get_encoding(fnm),
	                      errors="ignore") as f:
	                txt = f.read()

	        html_doc = readability.Document(txt)
	        title = html_doc.title()
	        # Plain text is extracted from the readability summary, which is
	        # requested as a partial HTML fragment (html_partial=True).
	        content = html_text.extract_text(html_doc.summary(html_partial=True))
	        txt = f'{title}\n{content}'
	        sections = txt.split("\n")
	        return sections