Recent Posts
C# | 10 sec ago
None | 20 sec ago
None | 43 sec ago
None | 54 sec ago
None | 1 min ago
None | 1 min ago
None | 1 min ago
None | 1 min ago
None | 1 min ago
None | 2 min ago
Sitereport
Find cool info about any domain on the internet?
visit sitereport
Free Subdomains
Want a pastebin.com sub-domain for your community?
learn more...
What is pastebin?
Pastebin is a website that hosts all your text & code on dedicated servers for easy sharing.
learn more...
Learn a little bit about the new Pastebin.com on our help page. hide message
By bbroke on the 5th of Sep 2009 02:59:13 AM Download | Raw | Embed | Report
  1. #!/usr/bin/env python
  2. #
  3. #       culd.py
  4. #
  5. #       Copyright 2009 Robert Lischke <robert.lischke@gmail.com>
  6. #
  7. #       This program is free software; you can redistribute it and/or modify
  8. #       it under the terms of the GNU General Public License as published by
  9. #       the Free Software Foundation; either version 2 of the License, or
  10. #       (at your option) any later version.
  11. #
  12. #       This program is distributed in the hope that it will be useful,
  13. #       but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. #       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15. #       GNU General Public License for more details.
  16. #
  17. #       You should have received a copy of the GNU General Public License
  18. #       along with this program; if not, write to the Free Software
  19. #       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  20. #       MA 02110-1301, USA.
  21.  
  22. from getpass import getpass              # hide entered password
  23. import urllib                            # handle login
  24. import urllib2                           # handle urls
  25. import cookielib                         # handle cookies
  26.  
  27. from pybtex.database.input import bibtex # bibtex parsing
  28.  
  29. '''
  30. TODO:
  31. option: work with destination directory (not just the current one!)
  32. option: resynchronize to citeulike by article-id (i will need a test account!)
  33. option: instead of full download check for local .bib and local .pdf update to
  34. newer version (or after asking)
  35. option: use different formats! jabref, pybliographer, etc.
  36. option: articlefilename = articleid + '.pdf' (not readable, but short)
  37. '''
  38.  
  39. def get_data_from_url(url):
  40.         """Get data located at an URL."""
  41.         try:
  42.                 # TODO: we could add a timeout here
  43.                 response = opener.open(url)
  44.         except IOError, e:
  45.                 if hasattr(e, 'reason'):
  46.                         print 'Failed to reach a server at "' + url + '"'
  47.                         print 'Reason: ', e.reason
  48. #               elif hasattr(e, 'code'):
  49. #                       print 'The server couldn\'t fulfill the request.'
  50. #                       print 'Error code: ', e.code
  51.                 raise Exception()
  52.         data = response.read()
  53.         response.close()
  54.         return data
  55.  
  56. def save_file(filename, data):
  57.         """Save data (e.g. from get_data_from_url() to a file."""
  58.         # TODO: files are always overwritten!!!
  59.         if len(data) > 1:                       # this is buggy! why did i do this?
  60.                 try:
  61.                         f = open(filename,'wb')
  62.                         f.write(data)
  63.                         f.close()
  64.                 except Exception:
  65.                         print 'Error: Writing file "' + filename + '"'
  66.         else:
  67.                 print 'Error: Writing file "' + filename + '" (Empty file)'
  68.         return
  69.  
  70. def citeulike_login(username, password):
  71.         """Login at CiteULike.
  72.  
  73.         Keyword arguments:
  74.         username -- a CiteULike username
  75.         password -- corresponding password
  76.  
  77.         """
  78.         _LOGIN_URL = 'http://www.citeulike.org/login.do'
  79.  
  80.         global opener                                  # global for cookies to work
  81.  
  82.         cj = cookielib.CookieJar()
  83.         opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
  84.         logindata = urllib.urlencode({'username' : username, \
  85.            'password' : password})
  86.         try:
  87.                 opener.open(_LOGIN_URL, logindata)
  88.         except IOError, e:
  89.                 if hasattr(e, 'reason'):
  90.                         print 'Failed to reach server at ' + _LOGIN_URL
  91.                         print 'Reason: ', e.reason
  92.                 elif hasattr(e, 'code'):
  93.                         print 'The server couldn\'t fulfill the request.'
  94.                         print 'Error code: ', e.code
  95.         # login successful if cookie is set, len(cj) is 4 after login
  96.         if len(cj) == 0:
  97.                 print 'Error: Invalid username or password'
  98.                 raise Exception()
  99.         else:
  100.                 return
  101.  
  102.  
  103. def get_bib(username):
  104.         """Get CiteULike BibTeX file for username
  105.         and store locally as "username.bib"
  106.  
  107.         Keyword arguments:
  108.         username -- a (valid) CiteULike username
  109.  
  110.         """
  111.         _BIB_URL1 = 'http://www.citeulike.org/bibtex/user/'
  112.         _BIB_URL2 = '?do_username_prefix=0&key_type=4' + \
  113.            '&incl_amazon=0&clean_urls=1&q=' # TODO: offer more options
  114.  
  115.         filename = username + '.bib'
  116.         print 'Retrieving "' + filename + '" from citeulike.org'
  117.         url = _BIB_URL1 + username + _BIB_URL2
  118.         try:
  119.                 data = get_data_from_url(url)
  120.         except Exception:
  121.                 print 'Error: Retrieving "' + filename + '" from citeulike.org'
  122.                 raise Exception()
  123.         save_file(filename, data)
  124.         return filename
  125.  
  126. def parse_bib(filename):
  127.         """Parse local BibTeX file."""
  128.         print 'Parsing "' + filename + '"'
  129.         parser = bibtex.Parser()
  130.         data = parser.parse_file(filename)
  131.         return data
  132.  
  133. def get_article_id(bibkey):
  134.         """Return citulike-article-id stored in BibTeX entry."""
  135.         try:
  136.                 id = bibdata.entries[bibkey].fields['citeulike-article-id']
  137.         except Exception:
  138.                 # every entry should have an article-id, but you never know
  139.                 print 'Error: No citeulike-article-id for BibTeX entry "' + \
  140.                    bibkey + '"'
  141.                 raise Exception()
  142.         return id
  143.  
  144. def build_article_filename(bibkey):
  145.         """Return customized .pdf filename of BibTeX entry."""
  146.         _FILETYPE = '.pdf'
  147.         _MAX_LENGTH = 250
  148.  
  149.         def _validfilename(dirtyfilename):
  150.                 import string
  151.                 _VALID_CHARS = frozenset("-_.() %s%s" % \
  152.                    (string.ascii_letters, string.digits))
  153.                 validfilename = ''.join(c for c in dirtyfilename if c in _VALID_CHARS)
  154.                 return validfilename
  155.  
  156.         try:
  157.                 author = bibdata.entries[bibkey].fields['author']
  158.                 # TODO: handle multiple authors (et.al)
  159.         except Exception:
  160.                 try:
  161.                         editor = bibdata.entries[bibkey].fields['editor']
  162.                         author = editor + ' (Ed.)'
  163.                         # TODO: handle multiple editors (Eds.)
  164.                 except Exception:
  165.                         author = 'Unknown'
  166.         try:
  167.                 year = bibdata.entries[bibkey].fields['year']
  168.         except Exception:
  169.                 year = 'Unknown'
  170.         try:
  171.                 title = bibdata.entries[bibkey].fields['title']
  172.         except Exception:
  173.                 title = 'Unknown'
  174.         filename = author + ' (' + year + ') ' + title
  175.         filename = _validfilename(filename)
  176.         # restrict to 255 characters
  177.         filename = filename[0:_MAX_LENGTH] + _FILETYPE
  178.         return filename
  179.  
  180. def add_entry_for_filename(bibkey, filename):
  181.         """Add BibTeX entry to bibdata referencing the filename. Currently
  182.         only supports JabRef (new format!)."""
  183.         _JABREF_DESCRIPTION = 'citeulikepdf:'
  184.         _JABREF_FILETYPE = ':PDF'
  185.  
  186.         # TODO: the old JabRef format is "pdf = {filename}"
  187.         def     _add_entry(_bibkey, _filename):
  188.                 temp_entry = bibdata.entries[_bibkey].fields
  189.                 temp_entry['file'] = _filename
  190.                 bibdata.entries[_bibkey].fields = temp_entry
  191.                 return
  192.  
  193.         filename_in_bib = ''
  194.         new_filename_in_bib = _JABREF_DESCRIPTION + filename + _JABREF_FILETYPE
  195.  
  196.         try:
  197.                 filename_in_bib = bibdata.entries[bibkey].fields['file']
  198.         except Exception:
  199.                 # add
  200.                 _add_entry(bibkey, new_filename_in_bib)
  201.                 return
  202.         if filename_in_bib != new_filename_in_bib:
  203.                 # update
  204.                 _add_entry(bibkey, new_filename_in_bib)
  205.                 print 'Warning: Updated reference to "' + filename + '" in "' + bibkey + '"'
  206.         return
  207.  
  208. def write_bib(filename, data):
  209.         """Save bilbiography as filename."""
  210.         from pybtex.database.output import bibtex as bibtex2 # bibtex export
  211.         writer = bibtex2.Writer(encoding='utf-8')
  212.         writer.write_file(data, filename)
  213.  
  214. def main():
  215.         """Putting it all together."""
  216.  
  217.         _PDFURL1 = 'http://www.citeulike.org/pdf/user/'
  218.         _PDFURL2 = '/article/'
  219.  
  220.         global bibdata
  221.  
  222.         username = raw_input("CiteULike Username : ")
  223.         password = getpass("CiteULike Password : ")
  224.  
  225.         try:
  226.                 citeulike_login(username, password)
  227.         except Exception:
  228.                 return 1
  229.         try:
  230.                 bibfilename = get_bib(username)
  231.         except Exception:
  232.                 return 1
  233.  
  234.         bibdata = parse_bib(bibfilename)
  235.         bibkeys = bibdata.entries.keys()
  236.  
  237.         i = 0
  238.         numberofbibkeys = len(bibkeys)
  239.         for key in bibkeys:
  240.                 i = i + 1
  241.                 print 'Processing BibTeX key "' + key + '" (' + str(i) + '/' + \
  242.                    str(numberofbibkeys) + ')'
  243.                 try:
  244.                         articleid = get_article_id(key)
  245.                 except Exception:
  246.                         pass
  247.                 else:
  248.                         articleurl = _PDFURL1 + username + _PDFURL2 + articleid
  249.                         articlefilename = build_article_filename(key)
  250.                         try:
  251.                                 articledata = get_data_from_url(articleurl)
  252.                         except Exception:
  253.                                 pass
  254.                         else:
  255.                                 save_file(articlefilename, articledata)
  256.                                 add_entry_for_filename(key, articlefilename)
  257.  
  258.         filename_output = username + '-culd.bib'
  259.         write_bib(filename_output, bibdata)
  260.         print 'Finished! Output written to "' + filename_output + '"'
  261.         return 0
  262.  
  263. if __name__ == '__main__': main()
Submit a correction or amendment below. Make A New Post
To highlight particular lines, prefix each line with @h@
Syntax highlighting:
Post expiration:
Post exposure:
Name / Title:
Email: