Advertisement
Guest User

Untitled

a guest
Jul 27th, 2016
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.41 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import pickle, sys
  3. from collections import Counter
  4. pages = pickle.load(open(sys.argv[1], 'rb'))
  5.  
  6. counts = Counter()
  7.  
  8. for url, src in pages.items():
  9. page = BeautifulSoup(src, "html.parser")
  10. for p in page.select('.display_name a'):
  11. if p.get('href'):
  12. counts[p['href']] += 1
  13.  
  14. ret = sorted([(c, url) for url,c in counts.items()])
  15. for c, url in ret:
  16. print(url, c)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement