Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import re
- import time
- import urllib2
- from BeautifulSoup import BeautifulSoup
- from lxml import html as lxmlhtml
def timeit(fn, *args):
    """Call ``fn(*args)`` 100 times and print the total elapsed time in ms.

    NOTE: this function shadows the stdlib ``timeit`` module; the name is
    kept because the benchmark loop below calls it by this name.

    :param fn: callable to benchmark (its ``__name__`` is used in the report)
    :param args: positional arguments forwarded to ``fn`` on every call
    :returns: None (the timing is printed, not returned)
    """
    start = time.time()
    for _ in range(100):
        fn(*args)
    elapsed_ms = (time.time() - start) * 1000.0
    # fn.__name__ works on both Python 2 and 3 (func_name was removed in
    # Python 3), and a single parenthesized argument to print is valid
    # statement syntax in Python 2 and a call in Python 3.
    print('%s took %0.3f ms' % (fn.__name__, elapsed_ms))
def bs_test(html):
    """Extract the <title> element from *html* using BeautifulSoup.

    Returns the title Tag object reached by attribute traversal
    (html -> head -> title), relying on the BeautifulSoup 3 API
    imported at the top of the file.
    """
    parsed = BeautifulSoup(html)
    return parsed.html.head.title
def lxml_test(html):
    """Extract the page title text from *html* using lxml.

    Parses the document, selects the first //title node via XPath,
    and returns its text content. Raises IndexError when the document
    has no <title> element.
    """
    document = lxmlhtml.fromstring(html)
    title_node = document.xpath('//title')[0]
    return title_node.text_content()
def regex_test(html):
    """Extract the first <title> contents from *html* with a regex.

    Non-greedy match between literal <title> tags; the pattern is
    deliberately compiled on every call because this function is one
    of the parsing strategies being benchmarked. Raises IndexError
    when no title tag is present.
    """
    titles = re.findall('<title>(.*?)</title>', html)
    return titles[0]
if __name__ == '__main__':
    # Fetch one sample page once, then benchmark each extraction
    # strategy against the same HTML so the timings are comparable.
    url = 'http://blog.sitescraper.net/2010/06/web-scraping-with-regular-expressions.html'
    sample_html = urllib2.urlopen(url).read()
    for parser in (bs_test, lxml_test, regex_test):
        timeit(parser, sample_html)
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement