#!/usr/bin/env python
#
# culd.py
#
# Copyright 2009 Robert Lischke <robert.lischke@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
from getpass import getpass # hide entered password
import urllib # handle login
import urllib2 # handle urls
import cookielib # handle cookies
from pybtex.database.input import bibtex # bibtex parsing
'''
TODO:
option: work with a destination directory (not just the current one!)
option: resynchronize to citeulike by article-id (I will need a test account!)
option: instead of a full download, check for a local .bib and local .pdf and
update to the newer version (or update after asking)
option: use different formats! jabref, pybliographer, etc.
option: articlefilename = articleid + '.pdf' (not readable, but short)
'''
def get_data_from_url(url):
    """Return the raw data located at an URL.

    The request is sent through the module-level ``opener`` that
    citeulike_login() installs.

    Keyword arguments:
    url -- address of the resource to download

    Returns whatever the response's read() yields.
    Raises Exception if the URL cannot be opened; errors raised while
    reading the response propagate unchanged.
    """
    try:
        # TODO: we could add a timeout here
        response = opener.open(url)
    except IOError as e:
        if hasattr(e, 'reason'):
            print('Failed to reach a server at "' + url + '"')
            # two spaces after the colon keep the output identical to the
            # former "print 'Reason: ', e.reason" statement
            print('Reason:  %s' % (e.reason,))
        # Errors without a 'reason' attribute (e.g. ones exposing only a
        # 'code') are intentionally not reported here.
        # NOTE(review): presumably to keep per-article downloads quiet when
        # an article has no PDF -- confirm.
        raise Exception('Could not retrieve "' + url + '"')
    try:
        return response.read()
    finally:
        # release the connection even when read() fails
        response.close()
def save_file(filename, data):
    """Save data (e.g. from get_data_from_url()) to a file.

    Keyword arguments:
    filename -- path of the file to write; an existing file is overwritten
    data -- content to store, written in binary mode

    Nothing is written when data is empty. Problems are reported on stdout
    and never raised, so the caller can carry on with the next file.
    """
    # TODO: files are always overwritten!!!
    if not data:
        # Only truly empty payloads are rejected; a one-byte payload is
        # valid data and must be written.
        print('Error: Writing file "' + filename + '" (Empty file)')
        return
    try:
        # the with-block closes the handle even if write() fails
        with open(filename, 'wb') as f:
            f.write(data)
    except Exception:
        # best effort by design: report the failure and continue
        print('Error: Writing file "' + filename + '"')
def citeulike_login(username, password):
    """Login at CiteULike.

    Rebinds the module-level ``opener`` to a cookie-aware opener, so that
    later requests made through it carry the session cookies.

    Keyword arguments:
    username -- a CiteULike username
    password -- corresponding password

    Raises Exception if no cookie is set after the login request, i.e. when
    the request failed or the credentials were rejected.
    """
    # NOTE(review): the credentials are posted over plain HTTP; consider an
    # HTTPS endpoint if the service offers one -- confirm before changing.
    _LOGIN_URL = 'http://www.citeulike.org/login.do'
    global opener  # global for cookies to work
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    logindata = urllib.urlencode({'username': username,
                                  'password': password})
    request_failed = False
    try:
        response = opener.open(_LOGIN_URL, logindata)
    except IOError as e:
        request_failed = True
        # two spaces after the colons keep the output identical to the
        # former "print 'label: ', value" statements
        if hasattr(e, 'reason'):
            print('Failed to reach server at ' + _LOGIN_URL)
            print('Reason:  %s' % (e.reason,))
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code:  %s' % (e.code,))
    else:
        # the body of the login page is not needed
        response.close()
    # login successful if cookie is set, len(cj) is 4 after login
    if len(cj) == 0:
        if request_failed:
            # the failure was already reported above; do not blame the
            # credentials for a request that never succeeded
            raise Exception('Login request to ' + _LOGIN_URL + ' failed')
        print('Error: Invalid username or password')
        raise Exception('Invalid username or password')
def get_bib(username):
    """Download the CiteULike BibTeX export of a user.

    The export is fetched with get_data_from_url() and handed to
    save_file() under the name "<username>.bib".

    Keyword arguments:
    username -- a (valid) CiteULike username

    Returns the name of the local .bib file.
    Raises Exception if the download fails.
    """
    # TODO: offer more options
    export_options = ''.join(['?do_username_prefix=0&key_type=4',
                              '&incl_amazon=0&clean_urls=1&q='])
    bibfile = username + '.bib'
    print('Retrieving "' + bibfile + '" from citeulike.org')
    export_url = ''.join(['http://www.citeulike.org/bibtex/user/',
                          username, export_options])
    try:
        content = get_data_from_url(export_url)
    except Exception:
        print('Error: Retrieving "' + bibfile + '" from citeulike.org')
        raise Exception()
    save_file(bibfile, content)
    return bibfile
def parse_bib(filename):
    """Parse a local BibTeX file with pybtex and return the result.

    Keyword arguments:
    filename -- name of the .bib file to read
    """
    print(''.join(['Parsing "', filename, '"']))
    return bibtex.Parser().parse_file(filename)
def get_article_id(bibkey):
    """Return the citeulike-article-id stored in a BibTeX entry.

    Keyword arguments:
    bibkey -- key of the entry in the global bibdata

    Raises Exception (after printing an error) if the id cannot be read.
    """
    try:
        return bibdata.entries[bibkey].fields['citeulike-article-id']
    except Exception:
        # every entry should have an article-id, but you never know
        message = 'Error: No citeulike-article-id for BibTeX entry "'
        print(message + bibkey + '"')
        raise Exception()
def build_article_filename(bibkey):
    """Return the customized .pdf filename of a BibTeX entry.

    The name is built as "<author> (<year>) <title>", reduced to ASCII
    letters, digits and the characters "-_.() ", cut to 250 characters and
    completed with ".pdf". The editor followed by " (Ed.)" replaces a
    missing author; any part that cannot be read becomes "Unknown".

    Keyword arguments:
    bibkey -- key of the entry in the global bibdata
    """
    import string

    suffix = '.pdf'
    max_stem_length = 250
    allowed = frozenset('-_.() ' + string.ascii_letters + string.digits)

    def _field(name):
        # raw lookup; raises when the entry or the field is missing
        return bibdata.entries[bibkey].fields[name]

    def _field_or_unknown(name):
        # lookup with the 'Unknown' placeholder as fallback
        try:
            return _field(name)
        except Exception:
            return 'Unknown'

    try:
        # TODO: handle multiple authors (et.al)
        author = _field('author')
    except Exception:
        try:
            # TODO: handle multiple editors (Eds.)
            author = _field('editor') + ' (Ed.)'
        except Exception:
            author = 'Unknown'
    year = _field_or_unknown('year')
    title = _field_or_unknown('title')

    raw_name = author + ' (' + year + ') ' + title
    clean_name = ''.join([char for char in raw_name if char in allowed])
    # truncate the stem so the complete name stays below 255 characters
    return clean_name[:max_stem_length] + suffix
def add_entry_for_filename(bibkey, filename):
    """Reference filename from the 'file' field of a BibTeX entry.

    The field is written to the global bibdata in the (new) JabRef format
    "citeulikepdf:<filename>:PDF". A warning is printed when an existing,
    different reference gets replaced.

    Keyword arguments:
    bibkey -- key of the entry in the global bibdata
    filename -- name of the local .pdf file
    """
    # TODO: the old JabRef format is "pdf = {filename}"
    link_prefix = 'citeulikepdf:'
    link_suffix = ':PDF'
    wanted_link = link_prefix + filename + link_suffix

    def _store_link():
        # fetch the field mapping, set the link and assign the mapping back
        fields = bibdata.entries[bibkey].fields
        fields['file'] = wanted_link
        bibdata.entries[bibkey].fields = fields

    try:
        current_link = bibdata.entries[bibkey].fields['file']
    except Exception:
        # no reference yet: add it silently
        _store_link()
    else:
        if current_link != wanted_link:
            # a different reference exists: replace it and tell the user
            _store_link()
            print('Warning: Updated reference to "' + filename +
                  '" in "' + bibkey + '"')
def write_bib(filename, data):
    """Save the bibliography as filename.

    Keyword arguments:
    filename -- name of the BibTeX file to write
    data -- bibliography data handed to the pybtex BibTeX writer
    """
    # imported under another name because the module-level 'bibtex' is the
    # input (parser) module
    from pybtex.database.output import bibtex as bibtex_output
    bibtex_output.Writer(encoding='utf-8').write_file(data, filename)
def main():
    """Putting it all together.

    Asks for the CiteULike credentials, logs in, downloads and parses the
    user's BibTeX export, tries to download the PDF of every entry, and
    writes the bibliography to "<username>-culd.bib".

    Returns 0 on success and 1 if the login or the BibTeX download failed.
    """
    global bibdata
    pdf_url_head = 'http://www.citeulike.org/pdf/user/'
    pdf_url_tail = '/article/'

    username = raw_input("CiteULike Username : ")
    password = getpass("CiteULike Password : ")
    try:
        citeulike_login(username, password)
    except Exception:
        return 1
    try:
        bibfilename = get_bib(username)
    except Exception:
        return 1

    bibdata = parse_bib(bibfilename)
    bibkeys = bibdata.entries.keys()
    total = len(bibkeys)
    for position, key in enumerate(bibkeys, 1):
        progress = '(' + str(position) + '/' + str(total) + ')'
        print('Processing BibTeX key "' + key + '" ' + progress)
        try:
            articleid = get_article_id(key)
        except Exception:
            # entry without an article-id: nothing to download
            continue
        articleurl = pdf_url_head + username + pdf_url_tail + articleid
        articlefilename = build_article_filename(key)
        try:
            articledata = get_data_from_url(articleurl)
        except Exception:
            # download failed: leave the entry untouched
            continue
        save_file(articlefilename, articledata)
        add_entry_for_filename(key, articlefilename)

    filename_output = username + '-culd.bib'
    write_bib(filename_output, bibdata)
    print('Finished! Output written to "' + filename_output + '"')
    return 0
if __name__ == '__main__':
    # main() returns 0 on success and 1 on failure; pass that on as the
    # process exit status instead of discarding it.
    import sys
    sys.exit(main())