Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs
- import urllib2
- import re
# Download the HTML of the Wikipedia article whose outgoing links get crawled.
req = urllib2.Request('http://en.wikipedia.org/wiki/Bach')
response = urllib2.urlopen(req)
try:
    # `page` holds the raw body returned by read(); it is what the crawl
    # loop further down hands to find_pages().
    page = response.read()
finally:
    # Release the connection even if read() raises.
    response.close()
def find_pages(x):
    """Return the distinct article names linked from the HTML text ``x``.

    Every ``href="/wiki/<name>"`` attribute contributes ``<name>`` exactly as
    it is written in the markup.  Duplicates are dropped and the result keeps
    first-occurrence order, so repeated runs on the same input yield the same
    list.

    :param x: HTML source to scan.
    :return: list of unique names, in order of first appearance.
    """
    # The capture excludes '"' as well as ' ', so it stops at the attribute's
    # closing quote and cannot run on into a following attribute when no
    # space separates the two.
    seen = set()
    pages = []
    for name in re.findall(r'href="/wiki/([^ "]*)"', x):
        if name not in seen:
            seen.add(name)
            pages.append(name)
    return pages
def find_organ(x):
    """Count the text segments of the HTML text ``x`` that mention "organ".

    A segment is a run of characters between a ``>`` and the next ``<``.
    Each segment counts once, however many times the word occurs inside it.
    Matching is case-insensitive and is a plain substring match, so
    "organs" or "Organist" count as well.

    :param x: HTML source to scan.
    :return: number of matching segments.
    """
    # [^<>] only keeps the match inside one segment; a '^' in the text is an
    # ordinary character and must not stop the match.
    return len(re.findall(r'>[^<>]*organ[^<>]*<', x, re.IGNORECASE))
# Total up the "organ" mentions over every article linked from the start page.
# The counter is initialised once, before the loop, so that it accumulates
# across pages instead of being reset for each one.
counter = 0
for i in find_pages(page):
    # Request the linked article itself: the loop variable, not the literal
    # string 'i', which would fetch the same /wiki/i page on every pass.
    req2 = urllib2.Request('http://en.wikipedia.org/wiki/' + i)
    response2 = urllib2.urlopen(req2)
    try:
        page2 = response2.read()
    finally:
        # Release the connection before moving on to the next article.
        response2.close()
    counter += find_organ(page2)
# Single-argument print() behaves the same as the print statement here.
print(counter)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement