# Pastebin.com

#!/usr/bin/env python
'''
Created on May 8, 2012
@author: Nisheeth
'''
import operator
import re
import urllib2
from contextlib import closing
from datetime import datetime, timedelta
from os import path
from xml.sax.saxutils import escape

from BeautifulSoup import BeautifulSoup

def _fetch_page(url):
    '''
        Download a page and return its raw body.

        @param url: address of the page to download
        @return: whatever the response's read() yields
    '''
    # closing() releases the connection even when read() raises.
    with closing(urllib2.urlopen(url)) as response:
        return response.read()


def _count_pages(parsed_page, default):
    '''
        Read the thread's page count from the page navigator.

        @param parsed_page: BeautifulSoup tree of one thread page
        @param default: value returned when the navigator markup is missing
        @return: the integer following the last space of the navigator label
        @raise ValueError: if that trailing text is not an integer
    '''
    pagenav = parsed_page.find('div', attrs={'class': 'pagenav'})
    label = None
    if pagenav is not None:
        label = pagenav.find('td', attrs={'class': 'vbmenu_control'})
    if label is None:
        # find() returns None when nothing matches -- presumably the case for
        # a thread that fits on a single page (TODO confirm against the site).
        return default
    label_text = ''.join(label.findAll(text=True))
    return int(label_text.rsplit(' ', 1)[-1])


def render_page(root_url):
    '''
        Scrape every page of a voting thread and render the tally as HTML.

        Every non-blank line of a post's text is counted as one vote for the
        name on that line (compared case-insensitively, surrounding whitespace
        ignored).  Within a post the first line scores 4, the second 2, the
        third 1, and each further line half of the previous one.  The first
        post of the first page is skipped.

        @param root_url: URL of main thread; its last five characters are
            dropped and '.html' is re-appended, so it is expected to end in
            '.html'
        @return: a complete HTML document with one table row per name
            (name, number of votes, total score), ordered by score with the
            highest first, followed by the total number of votes
    '''
    post_id_pattern = re.compile("^post_message_.*")
    ad_pattern = re.compile(r'.*<!-- END TEMPLATE: ad_showthread_firstpost_start ', re.S)
    separator_pattern = re.compile(r'[\t\r\n]+')

    base_url = root_url[:-5]
    page_index = 0
    page_count = 1
    total_votes = 0
    vote_count = {}  # lower-cased name -> [total score, number of votes]

    while page_index < page_count:
        # Page 1 lives at the root URL, page N at '<base>-N.html'.
        url = base_url
        if page_index > 0:
            url += '-' + str(page_index + 1)
        url += '.html'

        parsed_page = BeautifulSoup(_fetch_page(url))
        page_count = _count_pages(parsed_page, page_count)

        posts = parsed_page.findAll('div', attrs={'id': post_id_pattern})
        if page_index == 0:
            # The thread's opening post is not a ballot.
            posts = posts[1:]
            strip_ad = False
        else:
            # Original author's note: "fix for ads in first post" -- applied
            # to the first post of every page after the first.
            strip_ad = True
        page_index += 1

        for post in posts:
            inner_text = ''.join(post.findAll(text=True))
            if strip_ad:
                # Drop everything up to and including the ad template marker.
                inner_text = ad_pattern.sub(' ', inner_text)
                strip_ad = False

            exponent = 2
            for line in separator_pattern.split(inner_text.strip()):
                user = line.strip().lower()
                if not user:
                    # Blank lines are not votes and do not use up a rank.
                    continue
                # From the fourth line on the exponent is negative, so the
                # score becomes a fraction.
                # NOTE(review): confirm whether lines beyond the third should
                # be counted at all.
                score = 2 ** exponent
                exponent -= 1
                tally = vote_count.setdefault(user, [0, 0])
                tally[0] += score
                tally[1] += 1
                total_votes += 1

    # Order by name first so that the stable sort below breaks score ties
    # alphabetically, then rank by [score, votes] with the highest first.
    ranking = sorted(vote_count.items(), key=operator.itemgetter(0))
    ranking.sort(key=operator.itemgetter(1), reverse=True)

    # Names come from scraped forum posts, so escape them before embedding
    # them in markup.
    table_html = ''.join(
        "<tr><td>%s</td><td>%s</td><td>%s</td></tr>" % (escape(user), votes, score)
        for user, (score, votes) in ranking)

    return """
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>ThinkDigit Workspace Showoff Contest Scores</title>
    </head>
    <body>
    <div style="font-style:italic">Updates every 10 minutes</div>
    <table width="400px" border="" style="border-style:solid;border-width:1px;border-collapse:collapse">
    <thead><tr><th>User</th><th>Votes</th><th>Score</th></tr></thead>
    <tbody>%s</tbody>
    </table>
    <div>Total Votes: <b>%s</b></div>
    </body>
    </html>
    """ % (table_html, total_votes)


# Script entry: print the score page, regenerating the on-disk copy when it
# is missing or older than ten minutes.
root_url = 'http://www.thinkdigit.com/forum/chit-chat/157002-contest-voting-thread.html'
cache_filename = 'vote.cache'
cache_file = None
output = ''

cache_max_age = timedelta(minutes=10)

# The cache is usable only if the file exists and was modified no more than
# cache_max_age ago.
cache_is_fresh = False
if path.isfile(cache_filename):
    cache_age = datetime.now() - datetime.fromtimestamp(path.getmtime(cache_filename))
    cache_is_fresh = cache_age <= cache_max_age

if cache_is_fresh:
    with open(cache_filename, "r") as f:
        output = f.read()
else:
    # Render before opening the cache for writing: opening with "w" truncates
    # the file and refreshes its mtime, so a failure inside render_page()
    # would otherwise leave an empty cache that is served as fresh.
    output = render_page(root_url)
    with open(cache_filename, "w") as f:
        f.write(output)

# With a single argument the parenthesised form prints the same text under
# the Python 2 print statement.
print(output)