Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2016
230
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.46 KB | None | 0 0
import html
import re
import unicodedata
from html.parser import HTMLParser
  3.  
  4. class MyHTMLParser(HTMLParser):
  5. CDATA_CONTENT_ELEMENTS = ("script", "style", "code")
  6. def __init__(self, strict=False):
  7. super(MyHTMLParser, self).__init__()
  8. self.laststarttag = None
  9. self.secondtolaststarttag = None
  10. self.codeblocks = []
  11. self.textblock = ""
  12. def remove_accents(self, input_str):
  13. nkfd_form = unicodedata.normalize('NFKD', input_str)
  14. return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
  15. def get_code(self):
  16. rcodeblocks = []
  17. for codeblock in self.codeblocks:
  18. codeblock = codeblock.replace("<br>", "\n").replace("&gt;", ">").replace("&lt;", "<")
  19. codeblock = self.remove_accents(codeblock)
  20. codeblock = codeblock.encode('ascii', 'ignore')
  21. rcodeblocks.append(codeblock.decode())
  22. self.codeblocks = []
  23. return rcodeblocks
  24. def get_text(self):
  25. textblock = self.remove_accents(self.textblock)
  26. textblock = textblock.encode('ascii', 'ignore').decode()
  27. self.textblock = ""
  28. return textblock
  29. def handle_starttag(self, tag, attrs):
  30. self.secondtolaststarttag = self.laststarttag
  31. self.laststarttag = tag
  32. def handle_endtag(self, tag):
  33. self.secondtolaststarttag = None
  34. self.laststarttag = None
  35. def handle_data(self, data):
  36. if self.secondtolaststarttag == "pre" and self.laststarttag == "code":
  37. self.codeblocks.append(data)
  38. else:
  39. if not self.textblock:
  40. self.textblock = data
  41. else:
  42. self.textblock += "\n\n" + data
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement