Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2.7
- import re
- import sys
- import sys
- import datetime
- from urllib2 import urlopen
# count how many lines in a file
def file_len(fname):
    """Return the number of lines in the file at path *fname*.

    The file is opened in text mode and iterated line by line, so it is
    never loaded into memory as a whole. An empty file yields 0 (the
    previous enumerate-based loop left its counter unbound in that case
    and raised UnboundLocalError).

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    with open(fname) as f:
        return sum(1 for _ in f)
# ---------------------------------------------------------------------------
# Script body.
#
# Reads a list of sites from ./input.txt (one per line, in the form
# "* <http://site/> optional description"), fetches each site's front page to
# extract its question and answer counts, fetches its users listing to find
# how many pages of users it has, and finally prints all sites sorted by
# question count, highest first.
#
# Every print call takes exactly one parenthesised argument so that the
# output is identical under Python 2.7's print statement and Python 3's
# print function.
# ---------------------------------------------------------------------------

# By default there are 35 users per page of the users listing.
USERS_PER_PAGE = 35

# Patterns are compiled once here instead of being rebuilt for every site.
LINE_RE = re.compile(r'\* <(htt.*?)>(.*)$')
COUNTS_RE = re.compile(r'<div class="questions-count">\s*(\d+)\s*<.*<div class="questions-count">\s*(\d+)\s*<')
TITLE_RE = re.compile(r'<title>\s*(.+)\s*</title')
LAST_USER_PAGE_RE = re.compile(r'>\s*(\d+)\s*</a><span class="next">')

input_file = './input.txt'
input_file_length = file_len(input_file)
sites = []

# The with-block guarantees the input file is closed, including on sys.exit.
with open(input_file) as f:
    print("Fetching pages...")
    for i, line in enumerate(f):
        sys.stdout.flush()
        line = line.strip()
        m = LINE_RE.search(line)
        if not m:
            # A malformed input line is fatal; exit non-zero so callers can
            # tell the run did not complete.
            print("Failed to match line '{}'".format(line))
            sys.exit(1)

        site = {
            'url': m.group(1),
            'description': m.group(2),
            'questions': 0,
            'users': 0,
            'answers': 0,
        }
        print("{}/{} ({}) - Fetching data from {}".format(
            i + 1, input_file_length, datetime.datetime.now(), site['url']))

        # go get one site; a site whose front page cannot be fetched is
        # left out of the final list entirely
        try:
            html = urlopen(site['url'], timeout=60).read()
        except Exception as e:
            print("Problem fetching URL {}: {}".format(site['url'], e))
            continue

        # got the site. Search for the number of questions, and the number
        # of answers, in it
        counts = COUNTS_RE.search(html)
        # and get the page title, to use as a description, in case we don't
        # already have a description for this site
        title = TITLE_RE.search(html)

        # NOTE(review): the nesting below was reconstructed from a paste that
        # had lost its indentation; it assumes the description fallback and
        # the users fetch only run when the question count was found, while
        # the site is recorded either way -- confirm against the intended
        # flow.
        if not counts:
            print("Failed to find question count for {}".format(site['url']))
        else:
            # yay - we have a number of questions and answers - save them
            site['questions'] = int(counts.group(1))
            site['answers'] = int(counts.group(2))
            if site['description'] == "" and title:
                site['description'] = title.group(1)
            print("Found {} questions".format(site['questions']))

            # Build the users URL once so the request and the error message
            # agree, and so a site URL without a trailing slash still works.
            users_url = site['url'].rstrip('/') + '/users/'
            try:
                # now try to get the number of pages of users
                html = urlopen(users_url, timeout=60).read()
                last_page = LAST_USER_PAGE_RE.search(html)
                if not last_page:
                    print("Failed to find 'next page' tag for {}, assuming just 1 page of users".format(site['url']))
                    site['users'] = 1
                else:
                    site['users'] = int(last_page.group(1))
                print("Found {} pages of users".format(site['users']))
            except Exception as e:
                print("Problem fetching URL {}: {}".format(users_url, e))
                site['users'] = 0

        sites.append(site)

# and now we've cycled through all our sites
print("")
print("The sorted list:")
print("")

# sort them into descending order by number of questions; sort-then-reverse
# (rather than reverse=True) keeps the ordering of sites with equal counts
# exactly as before
sites.sort(key=lambda s: s['questions'])
sites.reverse()

# and finally, output the list of sites
for site in sites:
    pages = site['users']
    if pages > 0:
        # N pages of users means between (N-1)*35+1 and N*35 users
        user_range = "{}-{} users".format(
            pages * USERS_PER_PAGE - (USERS_PER_PAGE - 1),
            pages * USERS_PER_PAGE)
    else:
        # 0 pages means the user count was never obtained; avoid printing a
        # nonsensical negative range such as "-34-0 users"
        user_range = "unknown users"
    print(" * <{}> ({} questions, {}) {}".format(
        site['url'], site['questions'], user_range, site['description']))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement