Advertisement
Guest User

citeusyncv3.py

a guest
Mar 6th, 2019
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.10 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # Contact: Will Wade willwa.de
  3. # Date: April 2013
  4. # Needs mechanize and pybtex
  5. #
  6. # NB: Little error checking going on in this script
  7. # TODO: Check whether the local BibTeX file's last-download date is later than the last-modified date on CiteULike (CUL) -- if that is possible.
  8. #
  9. # With thanks to https://pypi.python.org/pypi/citeulike_api/0.1.3dev for the login part
  10. #
  11. # 2019-03-05: modified to keep paths short due to Windows filenaming limits
  12. # 2019-03-06: modified to name .json file with username
  13. # 2019-03-06: modified to place .bib file next to .json in working dir, instead of with the attachments
  14. # 2019-03-06: modified to handle multiple (up to 7) attachments per entry
  15. #
  16. import mechanize
  17. import time
  18. import sys
  19. import getopt
  20.  
  21. from pybtex.database.input import bibtex
  22.  
  23. # settings
  24. cUser = '...' # set later via cmd line
  25. cPass = '...' # set later via cmd line
  26.  
  27. opts, args = getopt.getopt(sys.argv[1:], "u:p:", [])
  28. for opt in opts:
  29.     if opt[0] == '-u':
  30.         cUser = opt[1]
  31.     elif opt[0] == '-p':
  32.         cPass = opt[1]
  33.  
  34. dataDir = './'
  35. localDir = dataDir + 'f/'
  36. culikeJson = 'http://www.citeulike.org/json/user/'+cUser
  37. localJsonPath = dataDir + cUser + '.json'
  38. localBibPath = dataDir + cUser + '.bib';
  39.  
  40. class CulError(Exception):
  41.     pass
  42.  
  43. class CiteULikeReader(object):
  44.  
  45.     MIN_API_WAIT = 5
  46.    
  47.     def __init__(self, user, password, localDir=''):
  48.         """ Start up... """
  49.         self.cUser = user
  50.         self.cPass = password
  51.         self.loggedin = False
  52.         self.getPDFs = True
  53.         self.cites = ''
  54.         self.localDir = localDir
  55.         self.last_api_access = time.time() - self.MIN_API_WAIT
  56.         self.loginToCiteULike()
  57.  
  58.     def wait_for_api_limit(self, min_wait=0):
  59.         min_wait = max(min_wait, self.MIN_API_WAIT)
  60.         now = time.time()
  61.         elapsed_time = now - self.last_api_access
  62.         if elapsed_time<min_wait:
  63.             time.sleep(min_wait-elapsed_time)
  64.         self.last_api_access = time.time()
  65.  
  66.     def loginToCiteULike(self):
  67.         """
  68.        Handle login. This should populate our cookie jar.
  69.        """
  70.         self.browser = mechanize.Browser()
  71.         self.browser.set_handle_robots(False)
  72.         self.browser.addheaders = [
  73.           ("User-agent", 'willwade/willwade@gmail.com citeusyncpy/1.0'),
  74.         ]
  75.         self.browser.open('http://www.citeulike.org/login?from=/')
  76.         self.browser.select_form(name='frm')
  77.         self.browser["username"] = self.cUser
  78.         self.browser["password"] = self.cPass
  79.         self.loggedin = True
  80.                
  81.         self.wait_for_api_limit()
  82.        
  83.         try:
  84.             #handle redirects manually to avoid connection flakiness
  85.             self.browser.set_handle_redirect(False)
  86.             resp = self.browser.submit()
  87.         except mechanize.HTTPError, e:
  88.             #This may not work for gold users. See http://www.citeulike.org/groupforum/2949?highlight=41927#msg_41927 for ideas.. feel free to write
  89.             if e.getcode()!=302 : raise e
  90.             next_page = e.info().getheader('Location')
  91.             if next_page == 'http://www.citeulike.org/' :
  92.                 #success
  93.                 self.logged_in = True
  94.             elif next_page.find('status=login-failed')>=0:
  95.                 raise CulError('Login Failed')
  96.             else:
  97.                 err = CulError('Unknown login response')
  98.                 err.data = e
  99.                 raise err
  100.         finally:
  101.             self.browser.set_handle_redirect(True)
  102.         #return ''.join(response.readlines())
  103.        
  104.     def getBibText(self):
  105.         self.browser.retrieve('http://www.citeulike.org/bibtex/user/'+self.cUser+'?do_username_prefix=0&key_type=3&incl_amazon=0&clean_urls=1&smart_wrap=0&export_attachment_names=t&fieldmap=posted-at:date-added', localBibPath)
  106.  
  107.     def getJson(self):
  108.         self.browser.retrieve(culikeJson, localJsonPath)
  109.    
  110.     def downloadPDFS(self):
  111.         #open a bibtex file
  112.         parser = bibtex.Parser()
  113.         bibdata = parser.parse_file(localBibPath)
  114.  
  115.         #loop through the individual references
  116.         for bib_id in bibdata.entries:
  117.             b = bibdata.entries[bib_id].fields
  118.             # uploads are limited to 2 files and 5 images, so loop up to 7 attachments per entry (2019-03-06)
  119.             for k in range(1, 7):
  120.                 try:
  121.                     fieldname = 'citeulike-attachment-' + str(k)
  122.                     filedl = b[fieldname].split(';')[1].strip()
  123.                     file_name = filedl.split('/')[7]
  124.                     filedl = 'http://www.citeulike.org'+filedl
  125.                     try:
  126.                        with open(localDir+file_name): pass
  127.                     except IOError:
  128.                        # Doesn't exist. Download it
  129.                         (filename, headers) = self.browser.retrieve(filedl,localDir+file_name)
  130.                         self.wait_for_api_limit()
  131.                 # field may not exist for a reference
  132.                 except(KeyError):
  133.                     continue
  134.  
  135.            
  136.    
  137. cureader = CiteULikeReader(cUser, cPass, localDir)
  138. cureader.getJson()
  139. cureader.getBibText()
  140. cureader.downloadPDFS()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement