Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pickle
import sys
from collections import Counter

from bs4 import BeautifulSoup
def count_display_name_links(pages):
    """Count how often each link target appears under ``.display_name``.

    Args:
        pages: Mapping whose values are HTML markup; the keys are not used.

    Returns:
        A ``Counter`` mapping each non-empty ``href`` of an ``<a>`` matched
        by the CSS selector ``.display_name a`` to its number of occurrences
        across all pages.
    """
    counts = Counter()
    for src in pages.values():
        page = BeautifulSoup(src, "html.parser")
        for anchor in page.select(".display_name a"):
            href = anchor.get("href")
            # Anchors with a missing or empty href are ignored.
            if href:
                counts[href] += 1
    return counts


def rank_by_count(counts):
    """Return ``(count, href)`` pairs sorted ascending by count, then href."""
    return sorted((count, href) for href, count in counts.items())


def main(argv=None):
    """Print every counted href with its count, least frequent first.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of a pickle file holding the pages mapping.

    Raises:
        SystemExit: With a usage message when the path argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "script"
        sys.exit(f"usage: {prog} PAGES_PICKLE")

    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only run this on pickles you produced yourself.
    with open(argv[1], "rb") as fh:
        pages = pickle.load(fh)

    for count, href in rank_by_count(count_display_name_links(pages)):
        print(href, count)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement