Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import html
import unicodedata
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Split fed HTML into ``<pre><code>`` blocks and remaining text.

    Feed markup with ``feed()``; afterwards ``get_code()`` returns the
    collected code blocks and ``get_text()`` returns everything else.
    Both getters reduce their result to plain ASCII and reset the
    corresponding buffer, so the parser can be reused.
    """

    # The base parser consults this tuple to decide which elements have
    # their content passed through raw (no tag or entity processing).
    # Adding "code" keeps markup-looking source text inside <code> intact;
    # the price is that entities there must be decoded by get_code().
    CDATA_CONTENT_ELEMENTS = ("script", "style", "code")

    def __init__(self, strict=False):
        """Create an empty parser.

        ``strict`` is accepted for backward compatibility and is ignored.
        """
        super().__init__()
        # The two most recently opened tags; both are cleared by any end tag.
        self.laststarttag = None
        self.secondtolaststarttag = None
        # Raw (still entity-encoded) contents of <pre><code> elements.
        self.codeblocks = []
        # Every other data chunk, joined by blank lines.
        self.textblock = ""

    def remove_accents(self, input_str):
        """Return *input_str* NFKD-normalized with combining marks removed."""
        decomposed = unicodedata.normalize("NFKD", input_str)
        return "".join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )

    def _to_ascii(self, text):
        """Strip accents from *text* and drop any remaining non-ASCII."""
        return self.remove_accents(text).encode("ascii", "ignore").decode()

    def get_code(self):
        """Return the collected code blocks as ASCII strings and reset them.

        Literal ``<br>`` markers become newlines and HTML character
        references (``&lt;``, ``&gt;``, ``&amp;``, ...) are decoded.
        """
        cleaned = []
        for raw in self.codeblocks:
            # Replace <br> before decoding so an escaped "&lt;br&gt;" that
            # is part of the code itself is not turned into a newline.
            with_newlines = raw.replace("<br>", "\n")
            cleaned.append(self._to_ascii(html.unescape(with_newlines)))
        self.codeblocks = []
        return cleaned

    def get_text(self):
        """Return the collected non-code text as ASCII and reset it."""
        text = self._to_ascii(self.textblock)
        self.textblock = ""
        return text

    def handle_starttag(self, tag, attrs):
        """Shift the start-tag history so the last two tags are known."""
        self.secondtolaststarttag = self.laststarttag
        self.laststarttag = tag

    def handle_endtag(self, tag):
        """Forget the start-tag history whenever any element closes."""
        self.secondtolaststarttag = None
        self.laststarttag = None

    def handle_data(self, data):
        """Store *data* as a code block or append it to the text buffer.

        Data counts as code only when the two most recent start tags were
        ``<pre>`` immediately followed by ``<code>``.
        """
        if self.secondtolaststarttag == "pre" and self.laststarttag == "code":
            self.codeblocks.append(data)
        elif self.textblock:
            self.textblock += "\n\n" + data
        else:
            self.textblock = data
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement