Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
# The encoding declaration above is required on Python 2 because the sample
# output quoted at the bottom of this script contains a non-ASCII em dash.
"""Fetch http://www.example.com, parse it with html5lib into an lxml tree,
and print the tree serialised as HTML."""
import contextlib
import urllib2

import html5lib
from html5lib import treebuilders
from lxml import etree

# urllib2 responses are not context managers on Python 2, so closing() is
# used to make sure the connection is released even if read() raises.
with contextlib.closing(urllib2.urlopen('http://www.example.com')) as response:
    data = response.read()

# html5lib places HTML elements in the XHTML namespace by default
# (namespaceHTMLElements=True). With the lxml tree builder that makes
# etree.tostring() emit every tag with an "html:" prefix, as in the sample
# output quoted below. Turning the option off yields plain element names.
parser = html5lib.HTMLParser(
    tree=treebuilders.getTreeBuilder("lxml"),
    namespaceHTMLElements=False,
)
etree_document = parser.parse(data)

# Single-argument print(...) behaves identically to the print statement on
# Python 2.
print(etree.tostring(etree_document, method='html'))

# For reference: output produced BEFORE namespaceHTMLElements=False was set.
# Note the unwanted "html:" prefix on every element.
#
# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
# <html:html xmlns:html="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml"><html:head>
# <html:title>IANA — Example domains</html:title>
# <!-- start common-head -->
# <html:meta content="text/html; charset=utf-8" http-equiv="Content-type"></html:meta>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement