Advertisement
Guest User

Untitled

a guest
Aug 6th, 2012
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.71 KB | None | 0 0
  1. import urllib2
  2.  
  3. import html5lib
  4. from html5lib import treebuilders
  5. from lxml import etree
  6.  
  7. data = urllib2.urlopen('http://www.example.com').read()
  8.  
  9. parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
  10. etree_document = parser.parse(data)
  11.  
  12. print etree.tostring(etree_document,method='html')
  13.  
  14.  
  15. """
  16. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  17. <html:html xmlns:html="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml"><html:head>
  18.     <html:title>IANA &#8212; Example domains</html:title>
  19.     <!-- start common-head -->
  20.     <html:meta content="text/html; charset=utf-8" http-equiv="Content-type"></html:meta>
  21. """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement