Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- from bs4.builder._lxml import LXML as BS_LXML_FEATURE
class FragmentSoup(BeautifulSoup):
    """BeautifulSoup variant that renders HTML fragments without a document wrapper.

    When the lxml HTML builder is in use and the markup does not begin with
    ``<html`` or ``<!doctype``, ``_feed`` wraps it in ``<html>`` (plus
    ``<body>`` unless the markup already starts with one) before parsing.
    ``decode`` then renders only the contents of that wrapper, so the output
    is the fragment itself. Internally the parsed tree still contains the
    wrapper elements.
    """

    # Name of the wrapper tag added by ``_feed`` ('html' or 'body'), or None
    # when the markup was parsed as-is. A single leading underscore is used on
    # purpose: a double-underscore attribute is name-mangled per class, so a
    # ``hasattr(self, '__name')`` check against the plain string never matches.
    _markup_enclosure = None

    @staticmethod
    def _enclose_fragment(markup):
        """Wrap fragment markup so it parses inside a known enclosure.

        :param markup: the markup about to be parsed, as text or bytes.
        :returns: ``(markup, enclosure)`` where ``enclosure`` is ``'html'``,
            ``'body'``, or ``None`` when the markup already looks like a
            complete document and is returned unchanged.
        """
        # Build every literal in the same type as the markup: mixing bytes
        # and text raises TypeError on Python 3.
        if isinstance(markup, bytes):
            lit = lambda text: text.encode('ascii')
        else:
            lit = lambda text: text

        # Detection ignores leading whitespace and letter case, so the
        # canonical "<!DOCTYPE html>" spelling counts as a full document.
        head = markup.lstrip()[:len('<!doctype')].lower()
        if head.startswith((lit('<html'), lit('<!doctype'))):
            return markup, None

        # If this is already a <body>, don't nest another one inside it.
        if head.startswith(lit('<body')):
            return lit('<html>') + markup + lit('</html>'), 'html'
        wrapped = (lit('<html><body>') + markup + lit('</body></html>'))
        return wrapped, 'body'

    def _feed(self, *args, **kwargs):
        """Wrap fragment markup when parsing HTML with lxml, then parse it.

        All arguments are forwarded unchanged to ``BeautifulSoup._feed`` and
        its return value is returned.
        """
        # Reset on every call so a value left over from an earlier parse of
        # different markup is never reused.
        self._markup_enclosure = None
        parses_html_with_lxml = (
            BS_LXML_FEATURE in self.builder.features
            and not self.builder.is_xml
        )
        if parses_html_with_lxml:
            self.markup, self._markup_enclosure = self._enclose_fragment(
                self.markup)
        return super(FragmentSoup, self)._feed(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Render the parsed markup as a string, omitting any added wrapper.

        When ``_feed`` wrapped the markup, only the contents of the wrapper
        element are rendered; the arguments are forwarded to that element's
        ``decode_contents``. Otherwise, or if the wrapper element cannot be
        found in the tree, the arguments are forwarded to
        ``BeautifulSoup.decode``.
        """
        if self._markup_enclosure is not None:
            enclosure = self.find(self._markup_enclosure)
            if enclosure is not None:
                # decode_contents renders text nodes as well as tags; calling
                # .decode() on each child fails for text nodes on Python 3.
                return enclosure.decode_contents(*args, **kwargs)
        return super(FragmentSoup, self).decode(*args, **kwargs)
Add Comment
Please, Sign In to add comment