# citeusyncv3.py

#!/usr/bin/env python
# Contact: Will Wade willwa.de
# Date: April 2013
# Needs mechanize and pybtex
#
# NB: Little error checking going on in this script
# TO-DO: Check whether the local bibtex file's last-download date is later than the last-modified date on CUL (if that is possible)
#
# With thanks to https://pypi.python.org/pypi/citeulike_api/0.1.3dev for the login part
#
# 2019-03-05: modified to keep paths short due to Windows filenaming limits
# 2019-03-06: modified to name .json file with username
# 2019-03-06: modified to place .bib file next to .json in working dir, instead of with the attachments
# 2019-03-06: modified to handle multiple (up to 7) attachments per entry
#
import mechanize
import time
import sys
import getopt

from pybtex.database.input import bibtex

# Settings.
# The credentials start as placeholders and are overridden by the
# -u <user> / -p <password> command-line options when those are given.
cUser = '...'
cPass = '...'

opts, args = getopt.getopt(sys.argv[1:], "u:p:", [])
# dict() keeps the last value of a repeated option, exactly like scanning
# the option list in order and overwriting on every match.
cli_values = dict(opts)
cUser = cli_values.get('-u', cUser)
cPass = cli_values.get('-p', cPass)

# Everything is stored relative to the working directory; attachments go
# into the short 'f/' sub-directory, the exports sit next to each other.
dataDir = './'
localDir = ''.join((dataDir, 'f/'))
culikeJson = ''.join(('http://www.citeulike.org/json/user/', cUser))
localJsonPath = ''.join((dataDir, cUser, '.json'))
localBibPath = ''.join((dataDir, cUser, '.bib'))

class CulError(Exception):
    """Error raised when CiteULike answers a request in an unexpected way."""

class CiteULikeReader(object):
    """Log in to CiteULike and mirror a user's library to local files.

    Constructing an instance performs the login immediately.  Afterwards
    ``getJson`` and ``getBibText`` download the two exports and
    ``downloadPDFS`` fetches the attachments referenced by the BibTeX export.

    ``getJson``, ``getBibText`` and ``downloadPDFS`` read the module-level
    settings ``culikeJson``, ``localJsonPath`` and ``localBibPath``.
    """

    # Minimum number of seconds between two rate-limited requests.
    MIN_API_WAIT = 5

    # Uploads are limited to 2 files and 5 images per entry, so the fields
    # 'citeulike-attachment-1' .. 'citeulike-attachment-7' are inspected.
    MAX_ATTACHMENTS = 7

    def __init__(self, user, password, localDir=''):
        """Store the settings and log in.

        Args:
            user: CiteULike user name.
            password: CiteULike password.
            localDir: Prefix (normally a directory ending in '/') that is
                prepended to every downloaded attachment's file name.

        Raises:
            CulError: If the login is rejected or answered unexpectedly.
        """
        self.cUser = user
        self.cPass = password
        # Both spellings are maintained: older revisions set ``loggedin``
        # before submitting the form and ``logged_in`` after a confirmed
        # login.  They now always carry the same value.
        self.loggedin = False
        self.logged_in = False
        self.getPDFs = True  # not read anywhere in this class
        self.cites = ''  # not read anywhere in this class
        self.localDir = localDir
        # Back-date the last access so the very first request is not delayed.
        self.last_api_access = time.time() - self.MIN_API_WAIT
        self.loginToCiteULike()

    def wait_for_api_limit(self, min_wait=0):
        """Sleep until enough time has passed since the previous call.

        Args:
            min_wait: Requested pause in seconds; values smaller than
                ``MIN_API_WAIT`` are raised to ``MIN_API_WAIT``.
        """
        min_wait = max(min_wait, self.MIN_API_WAIT)
        elapsed_time = time.time() - self.last_api_access
        if elapsed_time < min_wait:
            time.sleep(min_wait - elapsed_time)
        self.last_api_access = time.time()

    def loginToCiteULike(self):
        """Submit the login form; this should populate the cookie jar.

        Redirects are switched off while the form is submitted, so the
        server's 302 answer surfaces as ``mechanize.HTTPError`` and its
        ``Location`` header tells success from failure.

        Raises:
            CulError: On a 'login-failed' redirect or an unrecognised one.
            mechanize.HTTPError: For any HTTP error other than a 302.
        """
        self.browser = mechanize.Browser()
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [
            ("User-agent", 'willwade/willwade@gmail.com citeusyncpy/1.0'),
        ]
        self.browser.open('http://www.citeulike.org/login?from=/')
        self.browser.select_form(name='frm')
        self.browser["username"] = self.cUser
        self.browser["password"] = self.cPass

        self.wait_for_api_limit()

        try:
            # Handle redirects manually to avoid connection flakiness.
            self.browser.set_handle_redirect(False)
            self.browser.submit()
            # NOTE(review): a response without a redirect is not treated as
            # a confirmed login, so the flags stay False in that case.
        except mechanize.HTTPError as e:
            # This may not work for gold users. See
            # http://www.citeulike.org/groupforum/2949?highlight=41927#msg_41927
            # for ideas.
            if e.getcode() != 302:
                raise
            # get() is the dict-style alias of getheader(); a missing header
            # becomes '' so it ends up in the "unknown response" branch
            # instead of failing on None.
            next_page = e.info().get('Location') or ''
            if next_page == 'http://www.citeulike.org/':
                # Success: redirected to the site root.
                self.loggedin = True
                self.logged_in = True
            elif next_page.find('status=login-failed') >= 0:
                raise CulError('Login Failed')
            else:
                err = CulError('Unknown login response')
                err.data = e
                raise err
        finally:
            self.browser.set_handle_redirect(True)

    def getBibText(self):
        """Download the user's BibTeX export to ``localBibPath``."""
        url = (
            'http://www.citeulike.org/bibtex/user/' + self.cUser
            + '?do_username_prefix=0&key_type=3&incl_amazon=0&clean_urls=1'
            + '&smart_wrap=0&export_attachment_names=t'
            + '&fieldmap=posted-at:date-added'
        )
        self.browser.retrieve(url, localBibPath)

    def getJson(self):
        """Download the user's JSON export (``culikeJson``) to ``localJsonPath``."""
        self.browser.retrieve(culikeJson, localJsonPath)

    @staticmethod
    def _parse_attachment(value):
        """Turn one attachment field value into ``(file_name, url)``.

        The value is expected to hold the server path as its second
        ';'-separated part and the file name as that path's eighth
        '/'-separated segment.  Returns ``None`` when the value does not
        have that shape.
        """
        parts = value.split(';')
        if len(parts) < 2:
            return None
        remote_path = parts[1].strip()
        segments = remote_path.split('/')
        if len(segments) < 8:
            return None
        return segments[7], 'http://www.citeulike.org' + remote_path

    def _iter_attachments(self, fields):
        """Yield ``(file_name, url)`` for every usable attachment field."""
        for k in range(1, self.MAX_ATTACHMENTS + 1):
            fieldname = 'citeulike-attachment-' + str(k)
            try:
                value = fields[fieldname]
            except KeyError:
                # The field may not exist for a reference.
                continue
            parsed = self._parse_attachment(value)
            if parsed is None:
                # Skip just this attachment instead of aborting the sync.
                sys.stderr.write(
                    'Skipping malformed %s: %r\n' % (fieldname, value))
                continue
            yield parsed

    @staticmethod
    def _already_downloaded(path):
        """Return True when ``path`` can be opened for reading."""
        try:
            with open(path):
                return True
        except IOError:
            return False

    def downloadPDFS(self):
        """Download every attachment listed in the BibTeX export.

        Parses ``localBibPath`` and stores each attachment under
        ``self.localDir``.  Files that can already be opened locally are
        skipped; each actual download is followed by the rate-limit pause.
        """
        parser = bibtex.Parser()
        bibdata = parser.parse_file(localBibPath)

        for bib_id in bibdata.entries:
            fields = bibdata.entries[bib_id].fields
            for file_name, url in self._iter_attachments(fields):
                target = self.localDir + file_name
                if self._already_downloaded(target):
                    continue
                self.browser.retrieve(url, target)
                self.wait_for_api_limit()


# Run the sync. The constructor logs in; afterwards the JSON export, the
# BibTeX export and the attachments are fetched, in that order.
cureader = CiteULikeReader(cUser, cPass, localDir)
for sync_step in (cureader.getJson, cureader.getBibText, cureader.downloadPDFS):
    sync_step()