Spaces:
Sleeping
Sleeping
File size: 2,116 Bytes
592f71e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
from html.parser import HTMLParser
class WebParser(HTMLParser):
    """
    Convert tagged HTML into plain-text blocks that can be used by a ML model.

    Text is accumulated while the innermost open tag is in ``allowed_tags``
    and is emitted as one block each time a tag in ``block_tags`` closes.
    Typical use: ``feed()`` the markup, optionally ``close()`` to flush text
    that follows the last block-level element, then read ``get_blocks()``.
    """

    # Elements that never have an end tag. They must not be pushed on the
    # open-tag stack: nothing would ever pop them, and the text that follows
    # would be attributed to them instead of to the real enclosing element.
    VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        # Closing one of these tags ends the current text block.
        self.block_tags = {
            'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        }
        self.inline_tags = {
            '', 'a', 'b', 'main', 'span', 'em', 'strong', 'br'
        }
        # Text is kept only when the innermost open tag is in this set; ''
        # stands for "no tag open" (top-level text).
        # NOTE(review): h1-h6 are block tags but are not allowed here, so
        # heading text is discarded -- confirm this is intentional.
        self.allowed_tags = {'div', 'p', '', 'a', 'b', 'main', 'span', 'em', 'strong', 'br'}
        self.opened_tags = []    # stack of currently open (non-void) tags
        self.block_content = ''  # text gathered for the block in progress
        self.blocks = []         # finished text blocks, in document order

    def get_last_opened_tag(self):
        """
        Gets the innermost tag that is currently open
        :return: the tag name, or '' when no tag is open
        """
        return self.opened_tags[-1] if self.opened_tags else ''

    def error(self, message):
        """
        Ignores parser errors so that malformed markup is parsed best-effort
        :param message: the error description (unused)
        :return:
        """
        pass

    def handle_starttag(self, tag, attrs):
        """
        Handles the start tag of an HTML node in the tree
        :param tag: the HTML tag
        :param attrs: the tag attributes (unused)
        :return:
        """
        if tag not in self.VOID_TAGS:
            self.opened_tags.append(tag)

    def handle_endtag(self, tag):
        """
        Handles the end tag of an HTML node in the tree
        :param tag: the HTML tag
        :return:
        """
        if tag in self.block_tags:
            self._flush_block()
        # Unwind to the matching start tag so that elements whose end tag
        # was omitted (e.g. <li>, <td>) do not linger on the stack. An end
        # tag with no matching start tag leaves the stack untouched.
        if tag in self.opened_tags:
            while self.opened_tags.pop() != tag:
                pass

    def handle_data(self, data):
        """
        Handles a text HTML node in the tree
        :param data: the text node
        :return:
        """
        if self.get_last_opened_tag() not in self.allowed_tags:
            return
        # Collapse every whitespace run (spaces, newlines, tabs, NBSP) into
        # a single space and trim both ends.
        text = ' '.join(data.split())
        if text:
            self.block_content += text + ' '

    def close(self):
        """
        Finishes parsing and emits any text gathered after the last block tag
        :return:
        """
        super().close()
        self._flush_block()

    def get_blocks(self):
        """
        Gets the text blocks extracted so far
        :return: the list of block strings, in document order
        """
        return self.blocks

    def _flush_block(self):
        """
        Moves the pending block text, if not blank, into the finished blocks
        :return:
        """
        text = self.block_content.strip()
        if text:
            self.blocks.append(text)
        self.block_content = ''
|