maurobaraldi

hackershelf.com crawler hacking

Mar 7th, 2012
153
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.69 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. __author__ = 'Mauro Baraldi'
  4. __email__ = 'mauro.baraldi@gmail.com'
  5.  
  6. import codecs
  7. import urllib
  8. import BeautifulSoup
  9.  
  10. url = 'http://hackershelf.com/book/%i/'
  11. path = '/home/mauro/tmp/hacker_bookshelf.txt'
  12.  
  13. if __name__ == '__main__':
  14.     for i in range(1,200):
  15.         page = BeautifulSoup.BeautifulSoup(urllib.urlopen(url % i).read())
  16.         try:
  17.             title = page.find('h1', {'id':'book_title'}).text
  18.             link = page.find('a', {'class':'primary_link'}).text
  19.             with codecs.open(path,'a', encoding='utf8') as bookshelf:
  20.                 bookshelf.write('%i;%s;%s\n' % (i, title, link))
  21.         except AttributeError:
  22.             pass
Add Comment
Please, Sign In to add comment