Advertisement
andrew_s

revised version of ripper234's script

May 25th, 2011
219
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.86 KB | None | 0 0
  1. #!/usr/bin/python2.7
  2.  
  3. import re
  4. import sys
  5. import sys
  6. import datetime
  7.  
  8. from urllib2 import urlopen
  9.  
  10. # count how many lines in a file
  11. def file_len(fname):
  12.     with open(fname) as f:
  13.         for i, l in enumerate(f):
  14.             pass
  15.     return i + 1
  16.  
  17. input_file = './input.txt'
  18. input_file_length = file_len(input_file)
  19. f = open (input_file)
  20. list = []
  21.  
  22. print "Fetching pages..."
  23.  
  24. for i, line in enumerate(f):
  25.     sys.stdout.flush()
  26.     line = line.strip()
  27.     m = re.search('\* <(htt.*?)>(.*)$', line)
  28.     if (not m):
  29.         print "Failed to match line '{}'".format(line)
  30.     sys.exit()
  31.  
  32.     site = {}
  33.     site['url'],site['description'],site['questions'],site['users'],site['answers'] = m.group(1), m.group(2),0,0,0
  34.     print "{}/{} ({}) - Fetching data from {}".format(i+1, input_file_length, datetime.datetime.now(), site['url'])
  35.  
  36.     # go get one site
  37.     try:
  38.         html = urlopen(site['url'], timeout = 60).read()
  39.     except Exception as e:
  40.         print "Problem fetching URL {}: {}".format(site['url'], e)
  41.         continue
  42.  
  43.     # got the site. Search for the number of questions, and the number of answers, in it
  44.     m = re.search('<div class="questions-count">\s*(\d+)\s*<.*<div class="questions-count">\s*(\d+)\s*<', html)
  45.     # and get the page title, to use as a description, in case we don't already have a description for this site
  46.     m1 = re.search('<title>\s*(.+)\s*</title', html)
  47.     if (not m):
  48.     print "Failed to find question count for", site['url']
  49.     else:
  50.         # yay - we have a number of questions and answers - save them for this site
  51.     site['questions'] = int(m.group(1))
  52.     site['answers'] = int(m.group(2))
  53.     if ((site['description']=="") & (not not m1)):
  54.         site['description']=m1.group(1)
  55.    
  56.  
  57.     print "Found {} questions".format(site['questions'])
  58.    
  59.     try:
  60.         # now try to get the number of users. By default there are 35 users per page
  61.         html = urlopen(site['url']+'users/', timeout = 60).read()
  62.     m2 = re.search('>\s*(\d+)\s*</a><span class="next">', html)
  63.     if (not m2):
  64.         print "Failed to find 'next page' tag for {}, assuming just 1 page of users".format(site['url'])
  65.         site['users']=1
  66.     else:
  67.         site['users']=int(m2.group(1))
  68.        
  69.     print "Found {} pages of users".format(site['users'])
  70.    
  71.     except Exception as e:
  72.         print "Problem fetching URL {}/users/: {}".format(site['url'], e)
  73.         site['users']=0
  74.    
  75.     list.append(site)
  76.  
  77. # and now we've cycled through all our sites
  78. print
  79. print "The sorted list:"
  80. print
  81.  
  82. # sort them into descending order by number of questions
  83. list.sort(key = lambda x:x['questions'])
  84. list.reverse()
  85. # and finally, output the list of sites
  86. for site in list:
  87.     print " * <{}> ({} questions, ".format(site['url'], site['questions']) + "{}-{} users".format(site['users']*35-34,site['users']*35) + ") {}".format(site['description'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement