Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup, SoupStrainer
- import requests
- import re
def is_post(x):
    """Return True if the URL string *x* should be treated as a 2012 post.

    A URL qualifies when it contains both 'orvitinn' and '2012' and does
    not contain 'athugasemd' (presumably the marker for comment links --
    confirm against the site's URL scheme).
    """
    return 'orvitinn' in x and '2012' in x and 'athugasemd' not in x
# Fetch the front page and parse only its <a> tags.
front_page = BeautifulSoup(
    requests.get('http://www.orvitinn.com/').text,
    parse_only=SoupStrainer('a'))

# find_all('a') yields Tag objects only.  Iterating the soup directly walks
# its top-level nodes, which may include non-tag nodes (e.g. a doctype)
# that have no .get() method.
hrefs = (anchor.get('href', '') for anchor in front_page.find_all('a'))

# De-duplicate through a set; `links` stays a list as before.
links = list({href for href in hrefs if is_post(href)})
# Matches a digit, a colon and a digit (presumably a time stamp such as
# "12:34" -- confirm against the site's markup).  Compiled once, outside
# the loops.
time_pattern = re.compile(r'\d:\d')

# Distinct lower-cased names taken from the matching <h1> headings.
names = set()
for url in links:
    # Parse only the <article> elements of each post page.
    articles = BeautifulSoup(requests.get(url).text,
                             parse_only=SoupStrainer('article'))
    for heading in articles.find_all('h1'):
        # search() returns a match object or None; test for None explicitly
        # instead of comparing the result with an integer, which raises
        # TypeError on Python 3.
        if time_pattern.search(heading.text) is not None:
            # The name is the part of the heading before the first ' - '.
            names.add(heading.text.split(' - ')[0].lower())

# Single-argument print(...) behaves the same on Python 2 and Python 3.
for name in names:
    print(name)
print(len(names))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement