Advertisement
Guest User

Web scraping efficiency

a guest
Dec 1st, 2011
47
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.82 KB | None | 0 0
  1. import re
  2. import time
  3. import urllib2
  4. from BeautifulSoup import BeautifulSoup
  5. from lxml import html as lxmlhtml
  6.  
  7.  
  8. def timeit(fn, *args):
  9.     t1 = time.time()
  10.     for i in range(100):
  11.         fn(*args)
  12.     t2 = time.time()
  13.     print '%s took %0.3f ms' % (fn.func_name, (t2-t1)*1000.0)
  14.    
  15.    
  16. def bs_test(html):
  17.     soup = BeautifulSoup(html)
  18.     return soup.html.head.title
  19.    
  20. def lxml_test(html):
  21.     tree = lxmlhtml.fromstring(html)
  22.     return tree.xpath('//title')[0].text_content()
  23.    
  24. def regex_test(html):
  25.     return re.findall('<title>(.*?)</title>', html)[0]
  26.    
  27.    
  28. if __name__ == '__main__':
  29.     sample_html = urllib2.urlopen('http://blog.sitescraper.net/2010/06/web-scraping-with-regular-expressions.html').read()
  30.     for fn in (bs_test, lxml_test, regex_test):
  31.         timeit(fn, sample_html)
  32.  
  33.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement