Advertisement
Guest User

enginn

a guest
Dec 10th, 2012
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.75 KB | None | 0 0
  1. from bs4 import BeautifulSoup, SoupStrainer
  2. import requests
  3. import re
  4.  
  5.  
  6. is_post = lambda x: 'orvitinn' in x and '2012' in x and not 'athugasemd' in x
  7.  
  8. links = list(set([link.get('href', '')
  9.         for link in
  10.         BeautifulSoup(
  11.             requests.get('http://www.orvitinn.com/').text,
  12.             parse_only=SoupStrainer('a'))
  13.         if is_post(link.get('href', ''))]))
  14.        
  15. names = set()
  16. for l in links:
  17.     for comment in BeautifulSoup(requests.get(l).text, parse_only=SoupStrainer('article')):
  18.         for name in comment.findAll('h1'):
  19.             if re.compile('\d:\d').search(name.text) > -1:
  20.                 names.add(name.text.split(' - ')[0].lower())
  21.                
  22. for n in names:
  23.     print n
  24.    
  25. print len(names)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement