Advertisement
ripper234

OSQA site sorter

Mar 19th, 2011
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.43 KB | None | 0 0
  1. import re
  2. import sys
  3. import sys
  4. import datetime
  5.  
  6. from urllib2 import urlopen
  7.  
  8. def file_len(fname):
  9.     with open(fname) as f:
  10.         for i, l in enumerate(f):
  11.             pass
  12.     return i + 1
  13.  
  14. input_file = 'c:\\tmp\\input.txt'
  15. input_file_length = file_len(input_file)
  16. f = open (input_file)
  17. list = []
  18.  
  19. print "Fetching pages..."
  20.  
  21. for i, line in enumerate(f):
  22.     sys.stdout.flush()
  23.     line = line.strip()
  24.     m = re.search('\* <(http://.*?)>(?: - )?(.*)', line)
  25.     if (not m):
  26.         print "Failed to match line '{}'".format(line)
  27.     sys.exit()
  28.  
  29.     site = {}
  30.     site['url'], site['description'] = m.group(1), m.group(2)
  31.     print "{}/{} ({}) - Fetching data from {}".format(i+1, input_file_length, datetime.datetime.now(), site['url'])
  32.  
  33.     try:
  34.         html = urlopen(site['url'], timeout = 5).read()
  35.     except Exception as e:
  36.         print "Problem fetching URL {}: {}".format(site['url'], e)
  37.     continue
  38.  
  39.     m = re.search('<div class="questions-count">\s*(\d+)\s*<', html)
  40.     if (not m):
  41.     print "Failed to find question count for", site['url']
  42.     continue
  43.  
  44.     site['questions'] = int(m.group(1))
  45.  
  46.     print "Found {} questions".format(site['questions'])
  47.     list.append(site)
  48.  
  49. print
  50. print "The sorted list:"
  51. print
  52.  
  53. list.sort(key = lambda x:x['questions'])
  54. list.reverse()
  55. for site in list:
  56.     print " * <{}>".format(site['url']) + (" - {}".format(site['description']) if site['description'] != "" else "")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement