#!/usr/bin/env python ''' Created on May 8, 2012 @author: Nisheeth ''' import urllib2 import re import operator from BeautifulSoup import BeautifulSoup from os import path from datetime import datetime, timedelta def render_page(root_url): ''' @param root_url: URL of main thread ''' page_index = 0 page_count = 1 total_votes = 0; vote_count = {} k = 0; while page_index < page_count: url = root_url[:-5] if page_index > 0: url += '-'+str(page_index+1) url += '.html' #print url response = urllib2.urlopen(url) page = response.read() response.close() parsed_page = BeautifulSoup(page) pat = re.compile("^post_message_.*") page_count_txt = ''.join(parsed_page.find('div', attrs={'class': 'pagenav'}).find('td', attrs={'class': 'vbmenu_control'}).findAll(text=True)); page_count = int(page_count_txt[len(page_count_txt)-page_count_txt[::-1].index(' '):]) result=parsed_page.findAll('div', attrs={'id': pat}) first_post = False if page_index == 0: result = result[1:] else: first_post = True page_index += 1 for r in result: i = 2; if first_post: # fix for ads in first post t = re.compile(r'.*