# Untitled

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import urllib2
from retry import retry
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

# The path of the file listing the sub-forum URLs is mandatory; a second,
# optional argument names the file that receives the raw post data.
if len(sys.argv) < 2:
    # A usage error is reported on stderr and with a non-zero exit status so
    # that shell callers can tell it apart from a successful run.
    sys.stderr.write(' '.join(['Usage:', sys.argv[0], 'file_with_subforums_urls [file_to_output_raw_data_to]']) + '\n')
    sys.exit(1)

# URLs of the topics found while walking the forum listing pages.
topics = list()
# Text of each post stamp, with its 'Posted: ' prefix removed.
posts = list()
# Number of posts per key, where the key is the fourth space-separated field
# of a post stamp.
stats = dict()

with open(sys.argv[1]) as f:
    forums = f.readlines()
forums = [x.strip() for x in forums]

print 'Got',len(forums),'forums to process'

@retry(urllib2.URLError,tries=3,delay=2,backoff=2)
def urlopen_with_retry(url, timeout=30):
    """Open ``url`` with ``urllib2.urlopen``, retrying on ``urllib2.URLError``.

    The retry policy comes from the ``retry`` decorator above (tries=3,
    delay=2, backoff=2) -- presumably up to three attempts with a growing
    pause between them; see the ``retry`` package for the exact semantics.

    Args:
        url: The URL to open.
        timeout: Seconds handed to ``urllib2.urlopen`` as its socket timeout,
            so that a stalled server cannot block the scraper forever.

    Returns:
        The response object returned by ``urllib2.urlopen``.
    """
    return urllib2.urlopen(url, timeout=timeout)

for forum in forums:
	next_page = forum
	print "Processing",forum
	while next_page:
		try:
			response = urlopen_with_retry(next_page)
			page = response.read()
			parsed_page = BeautifulSoup(page,'lxml')
			next_page = ''
			for link in parsed_page.body.find_all('a'):
				if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td'  and link.parent.parent.get('class')[0] == 'gensmall':
					next_page = forum.split('?')[0] + '?' + link['href'].split('?')[1]
			for link in parsed_page.body.find_all('a',class_="topictitle"):
				if link.has_attr('href') and ('viewtopic.php' in link['href']) and link.parent.name == 'td' and link.parent.get('class')[0] == 'row1':
					topics.append(forum.split('?')[0].replace('viewforum','viewtopic') + '?' + link['href'].split('?')[1])
		except:
			print "Skipped due to error in",forum
			next_page = ''

print 'Got',len(topics),'topics to process'

for topic in topics:
	next_page = topic
	print "Processing",topic
	while next_page:
		try:
			response = urlopen_with_retry(next_page)
			page = response.read()
			parsed_page = BeautifulSoup(page,'lxml')
			next_page = ''
			for link in parsed_page.body.find_all('a'):
				if (not next_page) and link.text == "Next" and link.parent.name == 'b' and link.parent.parent.name == 'td'  and link.parent.parent.get('class')[0] == 'gensmall':
					next_page = topic.split('?')[0] + '?' + link['href'].split('?')[1]
			for td in parsed_page.body.find_all('td',class_="gensmall"):
				for child in td.children:
					if child.name == 'div' and child.has_attr('style') and child['style'] == 'float: right;':
						posts.append(child.text.replace('Posted: ',''))
		except:
			print "Skipped due to error in",topic
			next_page = ''

print 'Got',len(posts),'posts'

# When an output path was given as the second argument, dump the raw post
# stamps to it, one per line.
if len(sys.argv) == 3:
    # The with-block closes (and therefore flushes) the file even if a write
    # fails; the original left the handle open until interpreter exit.
    with open(sys.argv[2], 'w') as output:
        for post in posts:
            output.write("%s\n" % post)

for post in posts:
	post = post.split(' ')
	if not stats.has_key(post[3]):
		stats[post[3]] = 0
	stats[post[3]] += 1

print "Results as tsv:"

for key in sorted(stats.iterkeys()):
    print "%s\t%s" % (key,stats[key])