Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import sys
- import sys
- import datetime
- from urllib2 import urlopen
def file_len(fname):
    """Return the number of lines in the file at path ``fname``.

    The file is iterated line by line, so it is never read into memory as
    a whole.  An empty file yields 0 (the earlier enumerate-based version
    left its loop counter unbound in that case and raised
    ``UnboundLocalError``).

    Any error raised by ``open`` for a missing or unreadable file
    propagates to the caller.
    """
    with open(fname) as f:
        # Counting through a generator gives 0 naturally when the file has
        # no lines, so no special case is needed for empty input.
        return sum(1 for _ in f)
# Script body: read a list of sites, fetch each page, extract its question
# count, and print the sites sorted by that count (highest first).
#
# Each input line is expected to look like
#     * <http://example.com> - optional description
# Single-argument print(...) calls are used throughout; they produce the
# same output as the Python 2 print statement for one expression.

input_file = 'c:\\tmp\\input.txt'
# Total line count, used only for the "i/N" progress display below.
input_file_length = file_len(input_file)

# Compiled once, outside the loop.  Raw strings keep the regex escapes
# (\*, \s, \d) away from Python's own string-escape processing.
line_pattern = re.compile(r'\* <(http://.*?)>(?: - )?(.*)')
count_pattern = re.compile(r'<div class="questions-count">\s*(\d+)\s*<')

# Collected site records: dicts with 'url', 'description' and 'questions'.
# (Named `sites` rather than `list` so the builtin is not shadowed.)
sites = []

print("Fetching pages...")
# The context manager closes the input file even when sys.exit() fires.
with open(input_file) as f:
    for i, line in enumerate(f):
        # Push out progress from the previous iteration before the next
        # (potentially slow) network request.
        sys.stdout.flush()
        line = line.strip()
        m = line_pattern.search(line)
        if not m:
            # A malformed line aborts the whole run; exit non-zero so the
            # failure is visible to whatever invoked the script.
            print("Failed to match line '{}'".format(line))
            sys.exit(1)
        site = {}
        site['url'], site['description'] = m.group(1), m.group(2)
        print("{}/{} ({}) - Fetching data from {}".format(
            i + 1, input_file_length, datetime.datetime.now(), site['url']))
        try:
            response = urlopen(site['url'], timeout=5)
            try:
                html = response.read()
            finally:
                # Release the connection promptly instead of waiting for
                # garbage collection.
                response.close()
        except Exception as e:
            # Best effort: a site that cannot be fetched is reported and
            # skipped, the remaining sites are still processed.
            print("Problem fetching URL {}: {}".format(site['url'], e))
            continue
        m = count_pattern.search(html)
        if not m:
            print("Failed to find question count for {}".format(site['url']))
            continue
        site['questions'] = int(m.group(1))
        print("Found {} questions".format(site['questions']))
        sites.append(site)

print("")
print("The sorted list:")
print("")
# Ascending sort followed by reverse() is kept on purpose: it orders sites
# with equal counts differently from sort(reverse=True), and the output
# order of the original script is preserved.
sites.sort(key=lambda entry: entry['questions'])
sites.reverse()
for site in sites:
    output = " * <{}>".format(site['url'])
    # The description is optional in the input; omit the separator if empty.
    if site['description'] != "":
        output += " - {}".format(site['description'])
    print(output)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement