#!/usr/bin/python
"""
Google Reader Topic Organizer

A simple proof of concept that organizes English RSS feeds aggregated by a
Google Reader account, spitting out a list containing the most relevant topics
and articles for easy organization and rapid, idiomatic consumption. Ideally,
this script should be used as guidance for a similar feature in new RSS
readers everywhere.

Based on a script from here:
http://blog.yjl.im/2010/08/using-python-to-get-google-reader.html

This script REQUIRES an additional file named "reader.cfg" in the
current working directory to function correctly. The contents of this
file should appear as follows:

[Main]
username: example@gmail.com
password: mypass;thisunfortunatelymustbeinplaintext

... where "username" is your Google login address for Google Reader,
and "password" is your Google password. Please note that credentials
ARE STORED AND KEPT INSECURELY, INCLUDING IN RAM. Take the
appropriate precautions or, ideally, roll this into a better security
scheme instead.

This code is covered by a Creative Commons Attribution License, 3.0:
http://creativecommons.org/licenses/by/3.0/
Credit back to Goldkin Drake will do.
The original source this is based on is (c) 2008-2012 Yu-Jie Lin.

TODOs:
* Reflow URL-encoded characters as their correct representations
More TODOs and notes inline, below.

Initial release: January 22nd, 2012
"""

__author__ = "Goldkin Drake"

## Imports
import json      # To parse JSON input/output
import operator  # For itemgetter
import string    # For string operations
import urllib    # For URL and OSI Level 7 support
import urllib2   # Ditto

from xml.dom import minidom          # For XML parsing
from xml.dom import EMPTY_NAMESPACE  # Ditto (yes, a comma would do)

from ConfigParser import ConfigParser  # To parse our configuration file

## Constants
COUNT = 1000             # The maximum number of articles to consider from Reader
BONUS_MODIFIER = 1.0     # The score bonus applied for word and phrase
                         # complexity. This increases linearly with the length
                         # of a word or phrase being considered for indexing.
PENALTY_MODIFIER = 0.50  # The score penalty applied to common words found in
                         # multiple article summaries. This becomes linearly
                         # significant as the number of articles approaches
                         # infinity. Tweak this to see sharper cutoffs.
WORD_BLACKLIST = ("with", "from", "don39t")
                         # Additional word blacklist for common phrase
                         # transitions. This list is case-insensitive.

## Functions
def isLowercase(word):
    ''' Determine if a word only contains lowercase letters '''
    if word is None or word == '':
        return False
    return word == word.lower()

def isUppercase(word):
    ''' Determine if a word only contains uppercase letters '''
    if word is None or word == '':
        return False
    return word == word.upper()

def normalize(line):
    ''' Normalize a line of text by reformatting all forms of whitespace and
        removing punctuation '''
    for i in string.whitespace:
        line = line.replace(i, ' ')
    for i in string.punctuation:
        line = line.replace(i, '')
    return line.encode('ascii', 'ignore').lower()
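
# For example, normalize(u"Don't Panic!\tNow") returns 'dont panic now':
# punctuation is dropped, each whitespace character becomes a single space,
# and the result is ASCII-folded and lowercased.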

def isTransition(word):
    ''' Identify phrase fragments; these aren't necessarily sentence breaks '''
    return (len(word) < 4 and not isUppercase(word)) or \
           word.lower() in WORD_BLACKLIST
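
# For example, isTransition('the') and isTransition('with') are True, while
# isTransition('reader') is False. Note that titles pass through normalize()
# before this check, so they are already lowercased; the isUppercase()
# acronym exemption only fires for all-digit "words" in practice.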

def main():
    config = ConfigParser()
    config.read('reader.cfg')

    # Authenticate to obtain Auth
    auth_url = 'https://www.google.com/accounts/ClientLogin'
    auth_req_data = urllib.urlencode({
        'Email': config.get('Main', 'username'),
        'Passwd': config.get('Main', 'password'),
        'service': 'reader'
        })
    auth_req = urllib2.Request(auth_url, data=auth_req_data)
    auth_resp = urllib2.urlopen(auth_req)
    auth_resp_content = auth_resp.read()
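    # A successful ClientLogin response body is a set of KEY=VALUE lines
    # (SID, LSID, and Auth); splitting on newlines and the first '=' turns
    # it into a dict from which we pull the Auth token.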
    auth_resp_dict = dict(x.split('=', 1) for
                          x in auth_resp_content.split('\n') if x)
    AUTH = auth_resp_dict["Auth"]

    # Create a cookie in the header using the Auth
    header = {'Authorization': 'GoogleLogin auth=%s' % AUTH}

    reader_base_url = 'http://www.google.com/reader/api/0/unread-count?%s'
    reader_req_data = urllib.urlencode({'all': 'true', 'output': 'json'})
    reader_url = reader_base_url % (reader_req_data)
    reader_req = urllib2.Request(reader_url, None, header)
    reader_resp = urllib2.urlopen(reader_req)

    j = json.load(reader_resp)
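    # The unread-count response is JSON shaped roughly like:
    #   {"max": 1000, "unreadcounts": [
    #       {"id": "user/.../state/com.google/reading-list", "count": 42, ...},
    #       ...]}
    # so we pull the count for the aggregate reading-list entry, if any.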
    count = ([c['count'] for c in j['unreadcounts'] if
              c['id'].endswith('/state/com.google/reading-list')] or [0])[0]

    if count:
        print 'Unread: %d' % count
    else:
        print 'No unread items.'
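
    # Accumulators: wordtitles maps each word to the titles containing it,
    # wordcounts and summarywordcounts tally word frequencies in titles and
    # summaries respectively, phrases collects candidate topic phrases, and
    # urllookup maps each normalized title back to its article URL.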
    wordtitles = {}
    wordcounts = {}
    summarywordcounts = {}
    phrases = []
    titles = set()
    urllookup = {}

    # If we have articles, begin processing through them
    if count:
        ATOM_NS = 'http://www.w3.org/2005/Atom'
        reader_base_url = \
            r'http://www.google.com/reader/atom/user%2F-%2Fstate%2F' + \
            'com.google%2Freading-list?n=' + str(COUNT)
        reader_url = reader_base_url
        reader_req = urllib2.Request(reader_url, None, header)
        reader_resp = urllib2.urlopen(reader_req)

        doc = minidom.parse(reader_resp)
        doc.normalize()
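
        # Each Atom <entry> in the Reader feed carries a <title>, a
        # <link href="...">, an optional <summary> or <content> body, and
        # <category term="..."> labels that encode per-item state such as
        # ".../state/com.google/read".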
        for entry in doc.getElementsByTagNameNS(ATOM_NS, u'entry'):
            # Skip entries that are already marked as read
            if [True for cat in
                    entry.getElementsByTagNameNS(ATOM_NS, u'category') if
                    cat.getAttributeNS(EMPTY_NAMESPACE, u'term').endswith(
                        '/state/com.google/read')]:
                continue

            # Get the article title, summary content, and URL
            title = entry.getElementsByTagNameNS(
                ATOM_NS, u'title')[0].firstChild.data
            summary = ''
            url = entry.getElementsByTagNameNS(
                ATOM_NS, u'link')[0].getAttribute('href')

            # Use the summary to downvote title information as non-unique and
            # non-interesting. This allows us to separate unique, interesting
            # keywords in our titles from generic cruft spread over most
            # articles.
            try:
                summary = entry.getElementsByTagNameNS(
                    ATOM_NS, u'summary')[0].firstChild.data
                summary = normalize(summary)
                for word in summary.split(' '):
                    if word not in summarywordcounts:
                        summarywordcounts[word] = 1
                    else:
                        summarywordcounts[word] += 1
            except (IndexError, AttributeError):
                # No <summary> element; fall back to <content>, if present
                try:
                    summary = entry.getElementsByTagNameNS(
                        ATOM_NS, u'content')[0].firstChild.data
                except (IndexError, AttributeError):
                    pass

            # Normalize the title and add it to our titles list
            title = normalize(title)
            titles.add(title)
            urllookup[title] = url

            # Split the title out into its component phrases. In the English
            # language, we consider short transitions (shorter than four
            # letters and not an acronym) to be a phrase break, due to the
            # common transitions "a", "the", "and".
            phrase = ''
            for word in title.split(' '):
                if word == '':
                    continue
                if word not in wordtitles:
                    wordtitles[word] = []
                wordtitles[word].append(title)
                # TODO: Better identification of transitions
                if isTransition(word):
                    # Flush the phrase accumulated so far, counting the
                    # frequency of each of its words as we go
                    if phrase != '':
                        for phraseword in phrase.split(' '):
                            if phraseword not in wordcounts:
                                wordcounts[phraseword] = 1
                            else:
                                wordcounts[phraseword] += 1
                        phrases.append(phrase)
                        phrase = ''
                    continue
                if phrase == '':
                    phrase = word
                else:
                    phrase += ' '
                    phrase += word
            # Also count the frequency of each word in the trailing phrase
            if phrase != '':
                for phraseword in phrase.split(' '):
                    if phraseword not in wordcounts:
                        wordcounts[phraseword] = 1
                    else:
                        wordcounts[phraseword] += 1
                phrases.append(phrase)

    # Grab phrases and count their frequency. Phrase counts in titles will be
    # used as upvotes, flatly modified by the length of each word in the
    # phrase. The more complex and common the phrase, the higher it appears in
    # our scored list.
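    # Worked example with hypothetical counts: for the phrase "google reader",
    # if "google" appears in titles 5 times and in summaries twice, and
    # "reader" appears in titles 3 times and in no summaries, the raw score is
    #     (5 * 6 * BONUS_MODIFIER) - (2 * 6 * PENALTY_MODIFIER)   # "google"
    #   + (3 * 6 * BONUS_MODIFIER)                                # "reader"
    #   = 30.0 - 6.0 + 18.0 = 42.0, divided by 2 unique words = 21.0.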
    phrasecounts = {}
    for phrase in phrases:
        score = 0.0
        wordsSoFar = []
        for word in phrase.split(' '):
            # Score each unique word once: upvote by title frequency and
            # word length, downvote by summary frequency and word length
            if word not in wordsSoFar:
                wordsSoFar.append(word)
                score += wordcounts[word] * len(word) * BONUS_MODIFIER
                if word in summarywordcounts:
                    score -= summarywordcounts[word] * len(word) * \
                             PENALTY_MODIFIER
        phrasecounts[phrase] = score / len(wordsSoFar)

    for phrase in sorted(phrasecounts.iteritems(),
                         key=operator.itemgetter(1), reverse=True):
        # Find the titles that contain every word of this phrase
        articles = []
        for num, word in enumerate(phrase[0].split(' ')):
            if num == 0:
                # Copy the list so we don't mutate wordtitles as we filter
                articles = list(wordtitles[word])
            else:
                # Iterate over a copy so removal during iteration is safe
                for article in list(articles):
                    if word not in article:
                        articles.remove(article)
                if not articles:
                    break
        if articles:
            articles = titles.intersection(articles)
            articlecounts = {}
            for article in articles:
                #print phrase, ": ", article
                score = 0.0
                wordsSoFar = []
                for word in article.split(' '):
                    if isTransition(word):
                        continue
                    if word not in wordsSoFar:
                        wordsSoFar.append(word)
                        score += wordcounts[word] * len(word) * BONUS_MODIFIER
                        if word in summarywordcounts:
                            score -= summarywordcounts[word] * len(word) * \
                                     PENALTY_MODIFIER
                articlecounts[article] = score / len(wordsSoFar)
            #print articlecounts
            for article in sorted(articlecounts.iteritems(),
                                  key=operator.itemgetter(1), reverse=True):
                print phrase[0], ": ", article[0], urllookup[article[0]]
                titles.remove(article[0])

## Main entry point
# Only execute this script's main on direct invocation. Otherwise, assume that
# the user would prefer it as a library and can handle running the code
# themselves.
if __name__ == '__main__':
    main()
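
# Usage sketch (the filename is hypothetical; save this script under any
# name, e.g. reader_topics.py, next to your reader.cfg):
#   $ python reader_topics.py
#   Unread: 42
#   some topic phrase :  some article title http://example.com/article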