Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- reload(sys)
- sys.setdefaultencoding('utf-8')
- import urllib2
- from retry import retry
- try:
- from BeautifulSoup import BeautifulSoup
- except ImportError:
- from bs4 import BeautifulSoup
# --- Command-line handling and input loading -------------------------------
# argv[1]: text file with one sub-forum URL per line (required).
# argv[2]: optional file that receives the raw scraped post strings.
if len(sys.argv) < 2:
    # Passing a string to sys.exit() writes it to stderr and exits with
    # status 1, so shells and callers can tell a usage error from success.
    sys.exit('Usage: %s file_with_subforums_urls [file_to_output_raw_data_to]'
             % sys.argv[0])

topics = []  # topic URLs collected while walking the forum pages
posts = []   # raw post strings collected while walking the topic pages
stats = {}   # number of posts per value of the 4th space-separated field

with open(sys.argv[1]) as forum_file:
    # Strip surrounding whitespace and drop blank lines so that an empty
    # line in the input file is not counted (or processed) as a forum.
    forums = [line.strip() for line in forum_file if line.strip()]

print('Got %d forums to process' % len(forums))
@retry(urllib2.URLError, tries=3, delay=2, backoff=2)
def urlopen_with_retry(url, timeout=30):
    """Open *url* with ``urllib2.urlopen`` and return the response object.

    The call is wrapped by the ``retry`` decorator, configured with
    ``tries=3, delay=2, backoff=2`` and triggered by ``urllib2.URLError``
    (see the ``retry`` package for the exact meaning of these arguments).
    Any other exception propagates to the caller unchanged.

    Args:
        url: Address to open.
        timeout: Seconds passed to ``urllib2.urlopen`` as its blocking
            timeout. Without one, a server that stops responding would
            hang the whole crawl indefinitely.

    Returns:
        The file-like response returned by ``urllib2.urlopen``; the caller
        is responsible for closing it.
    """
    return urllib2.urlopen(url, timeout=timeout)
# --- Crawling: forums -> topics -> posts ------------------------------------
# Both passes walk a start page plus every page reachable through its "Next"
# pagination link; only the data harvested from each page differs.

def _first_class(tag):
    """Return the first entry of *tag*'s ``class`` attribute, or None."""
    classes = tag.get('class')
    return classes[0] if classes else None


def _query_of(href):
    """Return the segment of *href* right after its first '?', or None."""
    parts = href.split('?')
    return parts[1] if len(parts) > 1 else None


def _fetch_page(url):
    """Download *url* and return its content parsed with the lxml parser."""
    response = urlopen_with_retry(url)
    try:
        return BeautifulSoup(response.read(), 'lxml')
    finally:
        # Close explicitly: a long crawl should not leave connections open
        # until the garbage collector gets around to them.
        response.close()


def _next_page_url(soup, start_url):
    """Return the URL behind the first usable "Next" link, or '' if none.

    A link qualifies when its text is exactly "Next" and it sits inside a
    <b> whose parent is a <td> with 'gensmall' as its first class. The
    link's query string is re-attached to the path of *start_url*.
    """
    base = start_url.split('?')[0]
    for link in soup.body.find_all('a'):
        if link.text != "Next" or link.parent.name != 'b':
            continue
        cell = link.parent.parent
        if cell is None or cell.name != 'td' or _first_class(cell) != 'gensmall':
            continue
        query = _query_of(link.get('href', ''))
        if query is not None:
            return base + '?' + query
    return ''


def _collect_topics(soup, forum):
    """Append to ``topics`` the URL of every topic linked from a forum page."""
    base = forum.split('?')[0].replace('viewforum', 'viewtopic')
    for link in soup.body.find_all('a', class_="topictitle"):
        href = link.get('href', '')
        if 'viewtopic.php' not in href:
            continue
        if link.parent.name != 'td' or _first_class(link.parent) != 'row1':
            continue
        query = _query_of(href)
        if query is not None:  # a link without a query string is unusable
            topics.append(base + '?' + query)


def _collect_posts(soup, topic):
    """Append to ``posts`` the text of every post-date <div> on a topic page.

    *topic* is unused; it is accepted so both collectors share a signature.
    """
    for cell in soup.body.find_all('td', class_="gensmall"):
        for child in cell.children:
            # Check the tag name first: only tags are asked for attributes.
            if child.name == 'div' and child.get('style') == 'float: right;':
                posts.append(child.text.replace('Posted: ', ''))


def _crawl(start_url, harvest):
    """Call ``harvest(soup, start_url)`` on *start_url* and its "Next" pages.

    Pagination for *start_url* stops at the first page that fails; the error
    is reported and the crawl moves on to the caller's next start URL.
    """
    visited = set()  # guards against a "Next" link that loops back
    page_url = start_url
    while page_url and page_url not in visited:
        visited.add(page_url)
        current = page_url
        try:
            soup = _fetch_page(current)
            page_url = _next_page_url(soup, start_url)
            harvest(soup, start_url)
        except Exception as error:
            # Not a bare except: Ctrl-C and SystemExit must still stop the run.
            print("Skipped due to error in %s (while reading %s): %r"
                  % (start_url, current, error))
            page_url = ''


for forum in forums:
    print("Processing %s" % forum)
    _crawl(forum, _collect_topics)
print('Got %d topics to process' % len(topics))

for topic in topics:
    print("Processing %s" % topic)
    _crawl(topic, _collect_posts)
print('Got %d posts' % len(posts))
# --- Output -------------------------------------------------------------------
# Optional raw dump: one scraped post string per line. Any second argument
# enables it, so trailing extra arguments no longer silently disable the dump.
if len(sys.argv) > 2:
    # The with-block guarantees the file is flushed and closed.
    with open(sys.argv[2], 'w') as output:
        for post in posts:
            output.write("%s\n" % post)

# Count posts by the 4th space-separated field of each post string.
# NOTE(review): presumably that field is the year of the post date - confirm
# against the date format of the forum being scraped.
malformed = 0
for post in posts:
    fields = post.split(' ')
    if len(fields) < 4:
        # A short string must not crash the run after the whole crawl is done.
        malformed += 1
        continue
    stats[fields[3]] = stats.get(fields[3], 0) + 1

if malformed:
    sys.stderr.write("Ignored %d posts with fewer than 4 space-separated fields\n"
                     % malformed)

print("Results as tsv:")
for key in sorted(stats):
    print("%s\t%s" % (key, stats[key]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement