Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- '''
- Created on May 8, 2012
- @author: Nisheeth
- '''
- import urllib2
- import re
- import operator
- from BeautifulSoup import BeautifulSoup
- from os import path
- from datetime import datetime, timedelta
# Matches the ids of the <div> elements that hold the body of each post.
_POST_ID_PATTERN = re.compile("^post_message_.*")

# Everything up to (and including) the ad-template marker that shows up in the
# text of the first post of a page; re.S lets '.*' run across line breaks.
_FIRST_POST_AD_PATTERN = re.compile(
    r'.*<!-- END TEMPLATE: ad_showthread_firstpost_start ', re.S)

# Runs of tabs / line breaks that separate the entries listed in a post.
_ENTRY_SEPARATOR_PATTERN = re.compile(r'[\t\r\n]+')

# Page skeleton; filled with (table rows, total number of votes).
_PAGE_TEMPLATE = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>ThinkDigit Workspace Showoff Contest Scores</title>
</head>
<body>
<div style="font-style:italic">Updates every 10 minutes</div>
<table width="400px" border="" style="border-style:solid;border-width:1px;border-collapse:collapse">
<thead><tr><th>User</th><th>Votes</th><th>Score</th></tr></thead>
<tbody>%s</tbody>
</table>
<div>Total Votes: <b>%s</b></div>
</body>
</html>
"""


def _fetch_page(url):
    '''
    Download a page and return its raw body.
    @param url: URL to download
    @return: the response body as returned by read()
    '''
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()


def _read_page_count(parsed_page, default):
    '''
    Read the number of pages from the page navigation block.
    @param parsed_page: BeautifulSoup tree of a thread page
    @param default: value returned when the page has no usable navigation
    @return: the last whitespace-separated token of the navigation label,
             converted to int, or default when the label is missing
    '''
    pagenav = parsed_page.find('div', attrs={'class': 'pagenav'})
    if pagenav is None:
        # No navigation block on this page (presumably a single-page
        # thread); keep whatever page count is already known.
        return default
    label = pagenav.find('td', attrs={'class': 'vbmenu_control'})
    if label is None:
        return default
    words = ''.join(label.findAll(text=True)).split()
    if not words:
        return default
    return int(words[-1])


def _tally_post(post_text, vote_count):
    '''
    Add the user names listed in one post to the running tally.
    The first name gets 2**2 = 4 points, the second 2, the third 1, and
    every further name half of the previous one.
    @param post_text: plain text of the post, one user name per line
    @param vote_count: dict of lower-cased user name -> [score, votes],
                       updated in place
    @return: number of names counted from this post
    '''
    exponent = 2
    counted = 0
    for entry in _ENTRY_SEPARATOR_PATTERN.split(post_text.strip()):
        user = entry.strip().lower()
        if not user:
            # Blank lines (or a post without any text) are not votes and
            # must not use up a rank.
            continue
        totals = vote_count.setdefault(user, [0, 0])
        totals[0] += 2 ** exponent
        totals[1] += 1
        exponent -= 1
        counted += 1
    return counted


def render_page(root_url):
    '''
    Scrape every page of the voting thread and render the score table.
    @param root_url: URL of main thread; must end in '.html', pages after
                     the first are fetched from '<root>-<page number>.html'
    @return: the complete HTML page as a string
    '''
    base_url = root_url[:-5]  # strip the trailing '.html'
    page_index = 0
    page_count = 1
    total_votes = 0
    vote_count = {}
    while page_index < page_count:
        url = base_url
        if page_index > 0:
            url += '-' + str(page_index + 1)
        url += '.html'
        parsed_page = BeautifulSoup(_fetch_page(url))
        page_count = _read_page_count(parsed_page, page_count)
        posts = parsed_page.findAll('div', attrs={'id': _POST_ID_PATTERN})
        if page_index == 0:
            # The first post of the thread itself is not counted.
            posts = posts[1:]
            strip_ad = False
        else:
            # On later pages the first post carries ad text to strip.
            strip_ad = True
        page_index += 1
        for post in posts:
            post_text = ''.join(post.findAll(text=True))
            if strip_ad:  # fix for ads in first post
                post_text = _FIRST_POST_AD_PATTERN.sub(' ', post_text)
                strip_ad = False
            # NOTE(review): the total is counted per listed name, matching
            # the per-user "Votes" column - confirm that it was not meant
            # to be counted once per post.
            total_votes += _tally_post(post_text, vote_count)
    # Rank by score, then by number of votes (both descending); the user
    # name breaks remaining ties so the output is deterministic.
    ranking = sorted(
        vote_count.items(),
        key=lambda item: (-item[1][0], -item[1][1], item[0]))
    table_html = ''.join(
        "<tr><td>%s</td><td>%s</td><td>%s</td></tr>" % (user, totals[1], totals[0])
        for user, totals in ranking)
    return _PAGE_TEMPLATE % (table_html, total_votes)
# Script entry: print the score page, re-rendering it at most once every
# ten minutes and serving the cached copy from disk in between.
root_url = 'http://www.thinkdigit.com/forum/chit-chat/157002-contest-voting-thread.html'
cache_filename = 'vote.cache'
cache_file = None
output = ''

# Cached pages older than this are rendered again.
_CACHE_MAX_AGE = timedelta(minutes=10)


def _cache_is_fresh(filename, max_age):
    '''
    Tell whether a cached page can still be served.
    @param filename: path of the cache file
    @param max_age: timedelta; the cache is fresh while its age, measured
                    from the file modification time, does not exceed it
    @return: True when the file exists and is not older than max_age
    '''
    if not path.isfile(filename):
        return False
    age = datetime.now() - datetime.fromtimestamp(path.getmtime(filename))
    return age <= max_age


if _cache_is_fresh(cache_filename, _CACHE_MAX_AGE):
    with open(cache_filename, "r") as f:
        output = f.read()
else:
    # Render BEFORE opening the cache for writing: opening with "w"
    # truncates the file, so a failed fetch would otherwise leave an empty
    # cache with a fresh timestamp that is then served for ten minutes.
    output = render_page(root_url)
    with open(cache_filename, "w") as f:
        f.write(output)
print(output)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement