Advertisement
irenicus09

Beautiful Soup Example

Jun 2nd, 2011
2,427
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.98 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. __author__ = 'Irenicus09'
  4. __date__ = '2nd June 2011'
  5.  
  6.  
  7. ####################################################################################
  8. #                                                                                  #
  9. # This script has been released as an example for learning Beautiful Soup Module.  #
  10. #                                                                                  #
  11. # The author takes no liability for the use of this script in any way,             #
  12. # and it is to be used for educational purposes only.                              #
  13. #                                                                                  #
  14. ####################################################################################
  15.  
  16.  
  17.  
  18. import urllib2
  19. from BeautifulSoup import BeautifulSoup
  20.  
  21.    
  def scrapePythonSection():
      """
      Print a numbered list of the threads on the Intern0t Python forum page.

      Downloads the first page of http://forum.intern0t.net/perl-python/ while
      sending a Firefox User-Agent header, parses the HTML with BeautifulSoup
      and, for every <div class="inner"> that contains the expected markup,
      prints one line of the form

          <n>. <thread title> >> <title attribute of the author link>

      followed by trailing blank lines.  Entries that lack the expected
      <h3 class="threadtitle"> / <div class="author"> structure are skipped
      and do not consume a number.

      Returns nothing.  If the page cannot be downloaded, an error message is
      printed and the process exits with status 1 (raises SystemExit).
      """
      # Local import: sys is only needed for the failure exit below.
      import sys

      ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20110506 Firefox/4.0.1'
      req = urllib2.Request('http://forum.intern0t.net/perl-python/')
      req.add_header('User-Agent', ua)

      try:
          response = urllib2.urlopen(req)
          try:
              html = response.read()
          finally:
              # Release the connection even if the read fails part-way.
              response.close()
      except IOError:
          # urllib2.URLError/HTTPError and (since Python 2.6) socket errors all
          # derive from IOError.  Catching IOError rather than BaseException lets
          # Ctrl-C (KeyboardInterrupt) and SystemExit propagate normally.
          print('[!] Error Occured. ')
          print('[?] Check whether system is Online.')
          sys.exit(1)

      soup = BeautifulSoup(html)

      index = 0
      for base in soup.findAll('div', attrs={'class': 'inner'}):
          try:
              title = base.find('h3', attrs={'class': 'threadtitle'}).a.string
              details = base.find('div', attrs={'class': 'author'}).span.a['title']
          except (AttributeError, KeyError, TypeError):
              # A lookup returned None or the 'title' attribute is missing:
              # this div is not a complete thread entry, so skip it instead of
              # aborting the whole listing.
              continue
          index += 1
          print('%d. %s >> %s' % (index, title, details))

      print('\n\n')
  53.  
  54.    
  def printTitle():
      """Print the program banner, padded with blank lines, to standard output."""
      banner_lines = ('\n\t<<< COOKING WITH BEAUTIFUL SOUP >>>', '\n')
      for line in banner_lines:
          print(line)
  58.  
  if __name__ == "__main__":
      # Executed as a script (not imported): show the banner, then scrape and
      # print the thread listing.
      printTitle()
      scrapePythonSection()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement