organize_reader.py

#!/usr/bin/python
"""
Google Reader Topic Organizer

A simple proof of concept that organizes English RSS feeds aggregated by a
Google Reader account, spitting out a list containing the most relevant topics
and articles for easy organization and rapid, idiomatic consumption. Ideally,
this script should be used as guidance for a similar feature in new RSS
readers everywhere.

Based on a script from here:
http://blog.yjl.im/2010/08/using-python-to-get-google-reader.html


This script REQUIRES an additional file named "reader.cfg" in the
current working directory to function correctly. The contents of this
file should appear as follows:

[Main]
username: example@gmail.com
password: mypass;thisunfortunatelymustbeinplaintext

... where "username" is your Google Login address for Google Reader,
and "password" is your Google password. Please note that credentials
ARE STORED AND KEPT INSECURELY, INCLUDING IN RAM. Take the
appropriate precautions or, ideally, roll this into a better security
scheme instead.


This code is covered by a Creative Commons Attribution License, 3.0:
http://creativecommons.org/licenses/by/3.0/

Credit back to Goldkin Drake will do.
The original source this is based on is (c) 2008-2012 Yu-Jie Lin.


TODOs:

* Reflow URL-encoded characters as their correct representations

More TODOs and notes inline, below.


Initial release: January 22nd, 2012

"""

__author__ = "Goldkin Drake"

## Imports

import json                             # To parse JSON input/output
import operator                         # For itemgetter
import string                           # For string operations
import urllib                           # For URL encoding and HTTP requests
import urllib2                          # Ditto
from xml.dom import minidom             # For XML parsing
from xml.dom import EMPTY_NAMESPACE     # Ditto (yes, a comma would do)

from ConfigParser import ConfigParser   # To parse our configuration file


## Constants

COUNT = 1000            # The maximum number of articles to consider from Reader
BONUS_MODIFIER = 1.0    # The score bonus applied for word and phrase
                        #  complexity. This increases linearly with the length
                        #  of a word or phrase being considered for indexing.
PENALTY_MODIFIER = 0.50 # The score penalty applied to common words found in
                        #  multiple article summaries. This becomes linearly
                        #  significant as the number of articles approaches
                        #  infinity. Tweak this to see sharper cutoffs.
WORD_BLACKLIST = ("with", "from", "don39t")
                        # Additional word blacklist for common phrase
                        #  transitions. This list is case-insensitive.
                        #  ("don39t" is "don't" with its HTML-encoded
                        #  apostrophe (&#39;) flattened by normalize();
                        #  see the reflow TODO above.)

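# For example (illustrative numbers, not from a real run): a word seen 3 times
#  across title phrases earns 3 * len(word) * BONUS_MODIFIER, while 5 sightings
#  across summaries cost 5 * len(word) * PENALTY_MODIFIER; with the defaults
#  above, the penalty outweighs the bonus once a word appears more than twice
#  as often in summaries as in title phrases.
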
## Functions

def isLowercase(word):
    ''' Determine if a word contains no uppercase letters '''
    if not word:
        return False

    return word == word.lower()

def isUppercase(word):
    ''' Determine if a word contains no lowercase letters '''
    if not word:
        return False

    return word == word.upper()

def normalize(line):
    ''' Normalize a line of text by converting all forms of whitespace to
    spaces and removing punctuation '''
    for i in string.whitespace:
        line = line.replace(i, ' ')
    for i in string.punctuation:
        line = line.replace(i, '')
    return line.encode('ascii', 'ignore').lower()


def isTransition(word):
    ''' Identify phrase fragments; these aren't necessarily sentence breaks '''
    return (len(word) < 4 and not isUppercase(word)) or \
      word.lower() in WORD_BLACKLIST

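# For example: isTransition("the") and isTransition("with") are True, while
#  isTransition("USA") is False, since short all-uppercase words are treated
#  as acronyms rather than transitions.
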

def main():
    config = ConfigParser()
    config.read('reader.cfg')

    # Authenticate against ClientLogin to obtain an Auth token
    auth_url = 'https://www.google.com/accounts/ClientLogin'
    auth_req_data = urllib.urlencode({
        'Email': config.get('Main', 'username'),
        'Passwd': config.get('Main', 'password'),
        'service': 'reader'
        })
    auth_req = urllib2.Request(auth_url, data=auth_req_data)
    auth_resp = urllib2.urlopen(auth_req)
    auth_resp_content = auth_resp.read()
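    # The ClientLogin response body is newline-delimited key=value pairs
    #  (typically SID, LSID, and Auth); only the Auth token is needed here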
    auth_resp_dict = dict(x.split('=', 1) for \
         x in auth_resp_content.split('\n') if x)
    AUTH = auth_resp_dict["Auth"]

    # Pass the Auth token along in an Authorization header on each request
    header = {'Authorization': 'GoogleLogin auth=%s' % AUTH}

    reader_base_url = 'http://www.google.com/reader/api/0/unread-count?%s'
    reader_req_data = urllib.urlencode({'all': 'true', 'output': 'json'})

    reader_url = reader_base_url % (reader_req_data)
    reader_req = urllib2.Request(reader_url, None, header)
    reader_resp = urllib2.urlopen(reader_req)
    j = json.load(reader_resp)
    # j['unreadcounts'] is a list of {'id': ..., 'count': ...} records; pull
    #  out the aggregate reading-list count, defaulting to 0 if it's absent
    count = ([c['count'] for c in j['unreadcounts'] if \
      c['id'].endswith('/state/com.google/reading-list')] or [0])[0]

    if count:
        print 'Unread: %d' % count
    else:
        print 'No unread items.'

    wordtitles = {}         # word -> list of normalized titles containing it
    wordcounts = {}         # word -> occurrence count across title phrases
    summarywordcounts = {}  # word -> occurrence count across summaries
    phrases = []            # candidate phrases extracted from titles
    titles = set()          # all normalized, not-yet-printed titles
    urllookup = {}          # normalized title -> article URL

    # If we have articles, begin processing through them
    if count:
        ATOM_NS = 'http://www.w3.org/2005/Atom'

        reader_base_url = \
          r'http://www.google.com/reader/atom/user%2F-%2Fstate%2F' + \
          'com.google%2freading-list?n=' + str(COUNT)

        reader_url = reader_base_url
        reader_req = urllib2.Request(reader_url, None, header)
        reader_resp = urllib2.urlopen(reader_req)
        doc = minidom.parse(reader_resp)
        doc.normalize()

        for entry in doc.getElementsByTagNameNS(ATOM_NS, u'entry'):
            # Skip entries that Reader has already marked as read
            if [True for cat in \
              entry.getElementsByTagNameNS(ATOM_NS, u'category') if \
              cat.getAttributeNS(EMPTY_NAMESPACE, u'term').endswith( \
              '/state/com.google/read')]:
                continue

            # Get the article title and summary content
            title = entry.getElementsByTagNameNS( \
              ATOM_NS, u'title')[0].firstChild.data

            summary = ''
            url = entry.getElementsByTagNameNS( \
              ATOM_NS, u'link')[0].getAttribute('href')

            # Use the summary to downvote title information as non-unique and
            #  non-interesting. This allows us to separate unique, interesting
            #  keywords in our titles from generic cruft spread over most articles.
            try:
                summary = entry.getElementsByTagNameNS( \
                  ATOM_NS, u'summary')[0].firstChild.data
                summary = normalize(summary)

                for word in summary.split(' '):
                    if word not in summarywordcounts:
                        summarywordcounts[word] = 1
                    else:
                        summarywordcounts[word] += 1
            except (IndexError, AttributeError):
                # No usable <summary>; fall back to <content>, if present
                try:
                    summary = entry.getElementsByTagNameNS( \
                      ATOM_NS, u'content')[0].firstChild.data
                except (IndexError, AttributeError):
                    pass


            # Normalize the title and add it to our titles list
            title = normalize(title)
            titles.add(title)
            urllookup[title] = url

            # Split the title out into its component phrases. In the English
            #  language, we consider short transitions (fewer than four letters
            #  that aren't an acronym, or a blacklisted word) to be a phrase
            #  break, due to common transitions like "a", "the", and "and".
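            # For example, a normalized title "senate passes the budget bill"
            #  yields the phrases "senate passes" and "budget bill", with
            #  "the" acting as the break. (Illustrative title, not taken
            #  from a real feed.)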
            phrase = ''
            for word in title.split(' '):
                if word == '':
                    continue

                if word not in wordtitles:
                    wordtitles[word] = []
                wordtitles[word].append(title)

                # TODO: Better identification of transitions
                if isTransition(word):
                    if phrase != '':
                        # Count each word of the completed phrase
                        for phraseword in phrase.split(' '):
                            if phraseword not in wordcounts:
                                wordcounts[phraseword] = 1
                            else:
                                wordcounts[phraseword] += 1
                        phrases.append(phrase)
                        phrase = ''

                    continue

                if phrase == '':
                    phrase = word
                else:
                    phrase += ' '
                    phrase += word

            # Flush any trailing phrase, counting its word frequencies too
            if phrase != '':
                for phraseword in phrase.split(' '):
                    if phraseword not in wordcounts:
                        wordcounts[phraseword] = 1
                    else:
                        wordcounts[phraseword] += 1
                phrases.append(phrase)


    # Grab phrases and count their frequency. Phrase counts in titles will be
    #  used as upvotes, flatly modified by the length of each word in the
    #  phrase. The more complex and common the phrase, the higher it appears in
    #  our scored list.
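    # Worked example (illustrative): for the phrase "budget bill" with
    #  wordcounts of {"budget": 2, "bill": 3} and one summary sighting of
    #  "bill", the raw score is 2*6*1.0 + 3*4*1.0 - 1*4*0.50 = 22.0, stored
    #  as 22.0 / 2 = 11.0 after averaging over the two unique words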
    phrasecounts = {}
    for phrase in phrases:
        score = 0.0

        wordsSoFar = []
        for word in phrase.split(' '):
            if word not in wordsSoFar:
                wordsSoFar.append(word)
                score += wordcounts[word] * len(word) * BONUS_MODIFIER
                if word in summarywordcounts:
                    score -= summarywordcounts[word] * len(word) * \
                      PENALTY_MODIFIER

        phrasecounts[phrase] = score / len(wordsSoFar)

    for phrase in sorted(phrasecounts.iteritems(), \
      key=operator.itemgetter(1), reverse=True):
        # Start from the titles containing the phrase's first word, then
        #  keep only those that contain every subsequent word
        articles = []
        for num, word in enumerate(phrase[0].split(' ')):
            if num == 0:
                articles = wordtitles[word]
            else:
                articles = [article for article in articles if word in article]
            if not articles:
                break

        if articles:
            articles = titles.intersection(articles)
            articlecounts = {}
            for article in articles:
                #print phrase, ": ", article
                score = 0.0

                wordsSoFar = []
                for word in article.split(' '):
                    if isTransition(word):
                        continue
                    if word not in wordsSoFar:
                        wordsSoFar.append(word)
                        score += wordcounts[word] * len(word) * BONUS_MODIFIER
                        if word in summarywordcounts:
                            score -= summarywordcounts[word] * len(word) * \
                              PENALTY_MODIFIER

                # Guard against a title made up entirely of transitions
                articlecounts[article] = (score / len(wordsSoFar)
                                          if wordsSoFar else 0.0)

            #print articlecounts
            for article in sorted(articlecounts.iteritems(), \
              key=operator.itemgetter(1), reverse=True):

                print phrase, ": ", article[0], urllookup[article[0]]
                titles.remove(article[0])

## Main entry point

# Only execute this script's main on direct invocation. Otherwise, assume that
#  the user would prefer it as a library and can handle running the code
#  themselves.
if __name__ == '__main__':
    main()
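
# Usage sketch (assumes a valid reader.cfg in the working directory, per the
#  module docstring; the sample output below is illustrative, not a real run):
#
#     $ python organize_reader.py
#     Unread: 42
#     (u'budget bill', 11.0) :  senate passes the budget bill http://example.com/a
#
#  Each output line pairs a scored (phrase, score) tuple with a matching
#  unread article title and its URL, highest-scoring phrases first.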