organize_reader.py

#!/usr/bin/python
"""
Google Reader Topic Organizer

A simple proof of concept that organizes English RSS feeds aggregated by a
Google Reader account, spitting out a list containing the most relevant topics
and articles for easy organization and rapid, idiomatic consumption. Ideally,
this script should be used as guidance for a similar feature in new RSS
readers everywhere.

Based on a script from here:
http://blog.yjl.im/2010/08/using-python-to-get-google-reader.html


This script REQUIRES an additional file named "reader.cfg" in the
current working directory to function correctly. The contents of this
file should appear as follows:

[Main]
username: example@gmail.com
password: mypass;thisunfortunatelymustbeinplaintext

... where "username" is your Google Login address for Google Reader,
and "password" is your Google password. Please note that credentials
ARE STORED AND KEPT INSECURELY, INCLUDING IN RAM. Take the
appropriate precautions or, ideally, roll this into a better security
scheme instead.


This code is covered by a Creative Commons Attribution License, 3.0:
http://creativecommons.org/licenses/by/3.0/

Credit back to Goldkin Drake will do.
The original source this is based on is (c) 2008-2012 Yu-Jie Lin.


TODOs:

* Reflow URL-encoded characters as their correct representations

More TODOs and notes inline, below.


Initial release: January 22nd, 2012

"""

__author__ = "Goldkin Drake"

## Imports

import json                             # To parse JSON input/output
import operator                         # For itemgetter
import string                           # For string operations
import urllib                           # For URL encoding and HTTP requests
import urllib2                          # Ditto
from xml.dom import minidom             # For XML parsing
from xml.dom import EMPTY_NAMESPACE     # Ditto (yes, a comma would do)

from ConfigParser import ConfigParser   # To parse our configuration file


## Constants

COUNT = 1000            # The maximum number of articles to consider from Reader
BONUS_MODIFIER = 1.0    # The score bonus applied for word and phrase
                        #  complexity. This increases linearly with the length
                        #  of a word or phrase being considered for indexing.
PENALTY_MODIFIER = 0.50 # The score penalty applied to common words found in
                        #  multiple article summaries. This becomes linearly
                        #  significant as the number of articles approaches
                        #  infinity. Tweak this to see sharper cutoffs.
WORD_BLACKLIST = ("with", "from", "don39t")
                        # Additional word blacklist for common phrase
                        #  transitions. This list is case-insensitive.
                        #  ("don39t" is "don't" with its HTML-encoded
                        #  apostrophe (&#39;) flattened by normalize();
                        #  see the reflow TODO above.)

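# For example (illustrative numbers, not from a real run): a word seen 3 times
#  across title phrases earns 3 * len(word) * BONUS_MODIFIER, while 5 sightings
#  across summaries cost 5 * len(word) * PENALTY_MODIFIER; with the defaults
#  above, the penalty outweighs the bonus once a word appears more than twice
#  as often in summaries as in title phrases.
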
## Functions

def isLowercase(word):
    ''' Determine if a word contains no uppercase letters '''
    if not word:
        return False

    return word == word.lower()

def isUppercase(word):
    ''' Determine if a word contains no lowercase letters '''
    if not word:
        return False

    return word == word.upper()

def normalize(line):
    ''' Normalize a line of text by converting all forms of whitespace to
    spaces and removing punctuation '''
    for i in string.whitespace:
        line = line.replace(i, ' ')
    for i in string.punctuation:
        line = line.replace(i, '')
    return line.encode('ascii', 'ignore').lower()


def isTransition(word):
    ''' Identify phrase fragments; these aren't necessarily sentence breaks '''
    return (len(word) < 4 and not isUppercase(word)) or \
      word.lower() in WORD_BLACKLIST

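# For example: isTransition("the") and isTransition("with") are True, while
#  isTransition("USA") is False, since short all-uppercase words are treated
#  as acronyms rather than transitions.
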

def main():
    config = ConfigParser()
    config.read('reader.cfg')

    # Authenticate against ClientLogin to obtain an Auth token
    auth_url = 'https://www.google.com/accounts/ClientLogin'
    auth_req_data = urllib.urlencode({
        'Email': config.get('Main', 'username'),
        'Passwd': config.get('Main', 'password'),
        'service': 'reader'
        })
    auth_req = urllib2.Request(auth_url, data=auth_req_data)
    auth_resp = urllib2.urlopen(auth_req)
    auth_resp_content = auth_resp.read()
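    # The ClientLogin response body is newline-delimited key=value pairs
    #  (typically SID, LSID, and Auth); only the Auth token is needed here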
    auth_resp_dict = dict(x.split('=', 1) for \
         x in auth_resp_content.split('\n') if x)
    AUTH = auth_resp_dict["Auth"]

    # Pass the Auth token along in an Authorization header on each request
    header = {'Authorization': 'GoogleLogin auth=%s' % AUTH}

    reader_base_url = 'http://www.google.com/reader/api/0/unread-count?%s'
    reader_req_data = urllib.urlencode({'all': 'true', 'output': 'json'})

    reader_url = reader_base_url % (reader_req_data)
    reader_req = urllib2.Request(reader_url, None, header)
    reader_resp = urllib2.urlopen(reader_req)
    j = json.load(reader_resp)
    # j['unreadcounts'] is a list of {'id': ..., 'count': ...} records; pull
    #  out the aggregate reading-list count, defaulting to 0 if it's absent
    count = ([c['count'] for c in j['unreadcounts'] if \
      c['id'].endswith('/state/com.google/reading-list')] or [0])[0]

    if count:
        print 'Unread: %d' % count
    else:
        print 'No unread items.'

    wordtitles = {}         # word -> list of normalized titles containing it
    wordcounts = {}         # word -> occurrence count across title phrases
    summarywordcounts = {}  # word -> occurrence count across summaries
    phrases = []            # candidate phrases extracted from titles
    titles = set()          # all normalized, not-yet-printed titles
    urllookup = {}          # normalized title -> article URL

    # If we have articles, begin processing through them
    if count:
        ATOM_NS = 'http://www.w3.org/2005/Atom'

        reader_base_url = \
          r'http://www.google.com/reader/atom/user%2F-%2Fstate%2F' + \
          'com.google%2freading-list?n=' + str(COUNT)

        reader_url = reader_base_url
        reader_req = urllib2.Request(reader_url, None, header)
        reader_resp = urllib2.urlopen(reader_req)
        doc = minidom.parse(reader_resp)
        doc.normalize()

        for entry in doc.getElementsByTagNameNS(ATOM_NS, u'entry'):
            # Skip entries that Reader has already marked as read
            if [True for cat in \
              entry.getElementsByTagNameNS(ATOM_NS, u'category') if \
              cat.getAttributeNS(EMPTY_NAMESPACE, u'term').endswith( \
              '/state/com.google/read')]:
                continue

            # Get the article title and summary content
            title = entry.getElementsByTagNameNS( \
              ATOM_NS, u'title')[0].firstChild.data

            summary = ''
            url = entry.getElementsByTagNameNS( \
              ATOM_NS, u'link')[0].getAttribute('href')

            # Use the summary to downvote title information as non-unique and
            #  non-interesting. This allows us to separate unique, interesting
            #  keywords in our titles from generic cruft spread over most articles.
            try:
                summary = entry.getElementsByTagNameNS( \
                  ATOM_NS, u'summary')[0].firstChild.data
                summary = normalize(summary)

                for word in summary.split(' '):
                    if word not in summarywordcounts:
                        summarywordcounts[word] = 1
                    else:
                        summarywordcounts[word] += 1
            except (IndexError, AttributeError):
                # No usable <summary>; fall back to <content>, if present
                try:
                    summary = entry.getElementsByTagNameNS( \
                      ATOM_NS, u'content')[0].firstChild.data
                except (IndexError, AttributeError):
                    pass


            # Normalize the title and add it to our titles list
            title = normalize(title)
            titles.add(title)
            urllookup[title] = url

            # Split the title out into its component phrases. In the English
            #  language, we consider short transitions (fewer than four letters
            #  that aren't an acronym, or a blacklisted word) to be a phrase
            #  break, due to common transitions like "a", "the", and "and".
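            # For example, a normalized title "senate passes the budget bill"
            #  yields the phrases "senate passes" and "budget bill", with
            #  "the" acting as the break. (Illustrative title, not taken
            #  from a real feed.)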
            phrase = ''
            for word in title.split(' '):
                if word == '':
                    continue

                if word not in wordtitles:
                    wordtitles[word] = []
                wordtitles[word].append(title)

                # TODO: Better identification of transitions
                if isTransition(word):
                    if phrase != '':
                        # Count each word of the completed phrase
                        for phraseword in phrase.split(' '):
                            if phraseword not in wordcounts:
                                wordcounts[phraseword] = 1
                            else:
                                wordcounts[phraseword] += 1
                        phrases.append(phrase)
                        phrase = ''

                    continue

                if phrase == '':
                    phrase = word
                else:
                    phrase += ' '
                    phrase += word

            # Flush any trailing phrase, counting its word frequencies too
            if phrase != '':
                for phraseword in phrase.split(' '):
                    if phraseword not in wordcounts:
                        wordcounts[phraseword] = 1
                    else:
                        wordcounts[phraseword] += 1
                phrases.append(phrase)


    # Grab phrases and count their frequency. Phrase counts in titles will be
    #  used as upvotes, flatly modified by the length of each word in the
    #  phrase. The more complex and common the phrase, the higher it appears in
    #  our scored list.
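    # Worked example (illustrative): for the phrase "budget bill" with
    #  wordcounts of {"budget": 2, "bill": 3} and one summary sighting of
    #  "bill", the raw score is 2*6*1.0 + 3*4*1.0 - 1*4*0.50 = 22.0, stored
    #  as 22.0 / 2 = 11.0 after averaging over the two unique words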
    phrasecounts = {}
    for phrase in phrases:
        score = 0.0

        wordsSoFar = []
        for word in phrase.split(' '):
            if word not in wordsSoFar:
                wordsSoFar.append(word)
                score += wordcounts[word] * len(word) * BONUS_MODIFIER
                if word in summarywordcounts:
                    score -= summarywordcounts[word] * len(word) * \
                      PENALTY_MODIFIER

        phrasecounts[phrase] = score / len(wordsSoFar)

    for phrase in sorted(phrasecounts.iteritems(), \
      key=operator.itemgetter(1), reverse=True):
        # Start from the titles containing the phrase's first word, then
        #  keep only those that contain every subsequent word
        articles = []
        for num, word in enumerate(phrase[0].split(' ')):
            if num == 0:
                articles = wordtitles[word]
            else:
                articles = [article for article in articles if word in article]
            if not articles:
                break

        if articles:
            articles = titles.intersection(articles)
            articlecounts = {}
            for article in articles:
                #print phrase, ": ", article
                score = 0.0

                wordsSoFar = []
                for word in article.split(' '):
                    if isTransition(word):
                        continue
                    if word not in wordsSoFar:
                        wordsSoFar.append(word)
                        score += wordcounts[word] * len(word) * BONUS_MODIFIER
                        if word in summarywordcounts:
                            score -= summarywordcounts[word] * len(word) * \
                              PENALTY_MODIFIER

                # Guard against a title made up entirely of transitions
                articlecounts[article] = (score / len(wordsSoFar)
                                          if wordsSoFar else 0.0)

            #print articlecounts
            for article in sorted(articlecounts.iteritems(), \
              key=operator.itemgetter(1), reverse=True):

                print phrase, ": ", article[0], urllookup[article[0]]
                titles.remove(article[0])

## Main entry point

# Only execute this script's main on direct invocation. Otherwise, assume that
#  the user would prefer it as a library and can handle running the code
#  themselves.
if __name__ == '__main__':
    main()
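
# Usage sketch (assumes a valid reader.cfg in the working directory, per the
#  module docstring; the sample output below is illustrative, not a real run):
#
#     $ python organize_reader.py
#     Unread: 42
#     (u'budget bill', 11.0) :  senate passes the budget bill http://example.com/a
#
#  Each output line pairs a scored (phrase, score) tuple with a matching
#  unread article title and its URL, highest-scoring phrases first.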