ripper234

OSQA site sorter

Mar 19th, 2011
105
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import re
  2. import sys
  3. import sys
  4. import datetime
  5.  
  6. from urllib2 import urlopen
  7.  
  8. def file_len(fname):
  9.     with open(fname) as f:
  10.         for i, l in enumerate(f):
  11.             pass
  12.     return i + 1
  13.  
  14. input_file = 'c:\\tmp\\input.txt'
  15. input_file_length = file_len(input_file)
  16. f = open (input_file)
  17. list = []
  18.  
  19. print "Fetching pages..."
  20.  
  21. for i, line in enumerate(f):
  22.     sys.stdout.flush()
  23.     line = line.strip()
  24.     m = re.search('\* <(http://.*?)>(?: - )?(.*)', line)
  25.     if (not m):
  26.         print "Failed to match line '{}'".format(line)
  27.     sys.exit()
  28.  
  29.     site = {}
  30.     site['url'], site['description'] = m.group(1), m.group(2)
  31.     print "{}/{} ({}) - Fetching data from {}".format(i+1, input_file_length, datetime.datetime.now(), site['url'])
  32.  
  33.     try:
  34.         html = urlopen(site['url'], timeout = 5).read()
  35.     except Exception as e:
  36.         print "Problem fetching URL {}: {}".format(site['url'], e)
  37.     continue
  38.  
  39.     m = re.search('<div class="questions-count">\s*(\d+)\s*<', html)
  40.     if (not m):
  41.     print "Failed to find question count for", site['url']
  42.     continue
  43.  
  44.     site['questions'] = int(m.group(1))
  45.  
  46.     print "Found {} questions".format(site['questions'])
  47.     list.append(site)
  48.  
  49. print
  50. print "The sorted list:"
  51. print
  52.  
  53. list.sort(key = lambda x:x['questions'])
  54. list.reverse()
  55. for site in list:
  56.     print " * <{}>".format(site['url']) + (" - {}".format(site['description']) if site['description'] != "" else "")
RAW Paste Data