Advertisement
garthgriffin

dataversehelper.py

Oct 12th, 2015
161
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.14 KB | None | 0 0
  1. """dataversehelper.py
  2.  
  3. Author: Garth Griffin (http://garthgriffin.com)
  4. Date: May 21, 2015
  5.  
  6. This file is part of Antislavery Petitions Massachusetts Dataverse
  7. https://github.com/garthg/petitions-dataverse .
  8.  
  9. Licensed under GPL v3 as free software with no warranty, express or implied.
  10. See full license at:
  11. https://github.com/garthg/petitions-dataverse/blob/master/LICENSE .
  12. """
  13. import sys
  14. import requests
  15. import urllib
  16. import os
  17. import json
  18. import httplib2
  19. from xml import etree
  20. import zipfile
  21. import io
  22.  
  23.  
  24. class DataverseHelper(object):
  25.   dataverse_server = 'dataverse.harvard.edu'  # Production.
  26.  
  27.   def __init__(self, api_key, entity_id_cache_file=None, server=None):
  28.     self.api_key = api_key
  29.     self.dataverse_server = (server if server else
  30.         DataverseHelper.dataverse_server)
  31.     self.api_base_url = 'https://%s/api' % self.dataverse_server
  32.     self.query_base_url = '%s/search?key=%s&show_entity_ids=true&q=' % (
  33.         self.api_base_url, api_key)
  34.     self._edit_media_uri_base = None
  35.     self.entity_id_cache_file = entity_id_cache_file
  36.     if entity_id_cache_file is not None:
  37.       if os.path.isfile(entity_id_cache_file):
  38.         with open(entity_id_cache_file) as fid:
  39.           cache_string = fid.read()
  40.           self.entity_id_cache = json.loads(cache_string)
  41.           print 'Loaded %d cache entries from file: %s' % (
  42.               len(self.entity_id_cache), entity_id_cache_file)
  43.       else:
  44.         print >>sys.stderr, 'Initializing empty cache from file: %s' % (
  45.             entity_id_cache_file)
  46.         self.entity_id_cache = {}
  47.     else:
  48.       print >>sys.stderr, 'Initializing memory-only cache.'
  49.       self.entity_id_cache = {}
  50.     self.dataverse_connection = None
  51.  
  52.   @staticmethod
  53.   def _httpget(url, asjson=True):
  54.     resp, content = httplib2.Http().request(url)
  55.     if not resp['status'] == '200':
  56.       print content
  57.       raise RuntimeError('URL request failed: %s' % url)
  58.     if asjson:
  59.       content = json.loads(content)
  60.     return content
  61.  
  62.   def query(self, query, unique=True):
  63.     url = self.query_base_url + urllib.quote(query)
  64.     print 'Run query%s: %s --> %s' % ((' (unique)' if unique else ''), query,
  65.         url)
  66.     data = self._httpget(url)
  67.     if not data.get('status') == 'OK':
  68.       raise RuntimeError('Query failed bad status "%s": %s' % (
  69.         data.get('status'), url))
  70.     result_container = data.get('data')
  71.     if not result_container:
  72.       raise RuntimeError('Query failed with no data: %s' % url)
  73.     results = result_container.get('items')
  74.     if results is None:
  75.       raise RuntimeError('Query failed with no items: %s' % url)
  76.     if unique and len(results) == 0:
  77.       return None
  78.     if unique and len(results) > 1:
  79.       raise RuntimeError('Multiple matches for unique query: %s' % url)
  80.     if unique:
  81.       return results[0]
  82.     return results
  83.  
  84.   def get_study_header(self, doi):
  85.     result = self.query('dsPersistentId:"'+doi.replace(':','\\:')+'"', True)
  86.     if result is None:
  87.       return None
  88.     if not result.get('global_id') == doi:
  89.       raise RuntimeError('Query returned unmatched doi "%s": %s' % (
  90.         result.get('global_id'), url))
  91.     return result
  92.  
  93.   def _get_entity_id(self, doi):
  94.     data = self.get_study_header(doi)
  95.     if data is None:
  96.       return None
  97.     return data['entity_id']
  98.  
  99.   def set_entity_id(self, doi, entity_id):
  100.     self.entity_id_cache[doi] = entity_id
  101.     if self.entity_id_cache_file is not None:
  102.       with open(self.entity_id_cache_file, 'w') as fid:
  103.         fid.write(json.dumps(self.entity_id_cache, indent=2))
  104.       print >>sys.stderr, 'Wrote %d entries to cache file: %s' % (
  105.           len(self.entity_id_cache), self.entity_id_cache_file)
  106.  
  107.   def get_entity_id(self, doi):
  108.     if not doi in self.entity_id_cache:
  109.       entity_id = self._get_entity_id(doi)
  110.       if entity_id is not None:
  111.         self.set_entity_id(doi, entity_id)
  112.       return entity_id
  113.     return self.entity_id_cache[doi]
  114.  
  115.   def get_study_metadata(self, doi, version='latest'):
  116.     entity_id = self.get_entity_id(doi)
  117.     if entity_id is None:
  118.       raise ValueError('Dataset DOI %s not found.' % doi)
  119.     # URL adapted from Python wrapper:
  120.     # https://github.com/IQSS/dataverse-client-python/blob/master/dataverse/dataset.py
  121.     url = '{0}/datasets/{1}/versions/:{2}?key={3}'.format(
  122.         self.api_base_url,
  123.         entity_id,
  124.         version,
  125.         self.api_key
  126.         )
  127.     data = self._httpget(url)
  128.     if not data['status'] == 'OK':
  129.       raise RuntimeError('Bad Dataverse API status %s from URL: %s' % (
  130.         data['status'], url))
  131.     if not 'data' in data or not data['data']:
  132.       raise RuntimeError('No Dataverse API data returned from URL: %s' % (url))
  133.     return data['data']
  134.  
  135.   def update_study_metadata(self, doi, metadata):
  136.     entity_id = self.get_entity_id(doi)
  137.     # URL adapted from Python wrapper:
  138.     # https://github.com/IQSS/dataverse-client-python/blob/master/dataverse/dataset.py
  139.     url = '{0}/datasets/{1}/versions/:draft'.format(
  140.         self.api_base_url,
  141.         entity_id
  142.         )
  143.     resp = requests.put(
  144.         url,
  145.         headers={'Content-type': 'application/json'},
  146.         data=json.dumps(metadata),
  147.         params={'key': self.api_key},
  148.         )
  149.     if resp.status_code != 200:
  150.       raise OperationFailedError('JSON metadata could not be updated.')
  151.     updated_metadata = resp.json()['data']
  152.     return updated_metadata
  153.  
  154.   def _get_current_dataverse(self):
  155.     import dataverse  # Import here in case not installed.
  156.     conn = dataverse.Connection(self.dataverse_server, self.api_key)
  157.     dv_list = conn.get_dataverses()
  158.     if not dv_list:
  159.       raise RuntimeError('No dataverse found.')
  160.     if len(dv_list) > 1:
  161.       raise RuntimeError('Too many available dataverses.')
  162.     dv = dv_list[0]
  163.     print 'Connected to dataverse'
  164.     return dv
  165.  
  166.   def _get_edit_media_uri_base(self):
  167.     print 'Loading edit uri from collection info'
  168.     try:
  169.       import dataverse
  170.     except ImportError:
  171.       print >>sys.stderr, 'WARNING: Continuing without "dataverse" module.'
  172.       return 'https://dataverse.harvard.edu/dvn/api/data-deposit/v1.1/swordv2/edit/study/'
  173.     conn = dataverse.Connection(self.dataverse_server, self.api_key)
  174.     #edit_uri = conn.sword_base_url+'/edit/study/'  # Not sure why this breaks.
  175.     #print edit_uri
  176.     #return edit_uri
  177.     # This is fragile, it would be better to get the edit_uri some other way.
  178.     dv = self._get_current_dataverse()
  179.     dv_info = dv.get_collection_info()
  180.     print 'Loaded collection info'
  181.     entry_text = dv_info[dv_info.find('<entry'):dv_info.find('</entry>')+8]
  182.     root = etree.ElementTree.fromstring(entry_text)
  183.     links = filter(lambda x: x.tag == 'link' and x.get('rel') == 'edit-media',
  184.         root)
  185.     if not links:
  186.       raise RuntimeError('Failed to find <link rel="edit-media"> element.')
  187.     link = links[0]
  188.     full_href = link.get('href')
  189.     base = full_href[:full_href.find('/study/doi:')+7]
  190.     print 'Parsed edit URL base: %s' % base
  191.     return base
  192.  
  193.   def get_edit_media_uri(self, doi):
  194.     if not self._edit_media_uri_base:
  195.       self._edit_media_uri_base = self._get_edit_media_uri_base()
  196.     return self._edit_media_uri_base+doi
  197.  
  198.   def get_doi_from_search(self, query):
  199.     result = self.query(query, True)
  200.     if not result:
  201.       print 'No result for query: %s' % query
  202.       return None
  203.     if not 'global_id' in result or not result['global_id']:
  204.       raise RuntimeError('Bad result field global_id from query: %s' % query)
  205.     return result['global_id']
  206.  
  207.   def create_blank_study_BROKEN(self):
  208.     import dataverse  # Import here in case it's not installed.
  209.     if not self.dataverse_connection:
  210.       self.dataverse_connection = dataverse.Connection(self.dataverse_server,
  211.           self.api_key)
  212.     dv_list = self.dataverse_connection.get_dataverses()
  213.     if not dv_list:
  214.       raise RuntimeError('No dataverse found.')
  215.     if len(dv_list) > 1:
  216.       raise RuntimeError('Too many available dataverses.')
  217.     curr_dv = dv_list[0]
  218.     dataset = curr_dv.create_dataset(
  219.         'untitled',
  220.         'This study is intentionally blank, set up via automation.',
  221.         'Dataverse API Creator')
  222.     if not dataset:
  223.       raise RuntimeError('Failed to create dataset.')
  224.     self.set_entity_id(dataset.doi, dataset.id)
  225.     return dataset
  226.  
  227.   def create_and_publish_new_study(self,
  228.       title='untitled',
  229.       description='This study is intentionally blank, set up via automation.',
  230.       creator='Dataverse API creator'
  231.       ):
  232.     import dataverse
  233.     dv = self._get_current_dataverse()
  234.     # Set up the Dataset to be created.
  235.     dataset = dataverse.Dataset(
  236.         title=title,
  237.         description=description,
  238.         creator=creator
  239.         )
  240.     # Add the Dataset to the Dataverse.
  241.     # TODO remove these two lines and just use dv.collection.get('href')
  242.     url = dv.collection.get('href').replace('beta.harvard.edu', 'beta.dataverse.org')
  243.     print 'resp = requests.post(%r,\n  data="<entry xmlns=...>...</entry>",\n  headers={"Content-type": "application/atom+xml"},\n  auth=(my_api_key, None))' % (url)
  244.     resp = requests.post(
  245.         #dv.collection.get('href'),
  246.         url,
  247.         data=dataset.get_entry(),
  248.         headers={'Content-type': 'application/atom+xml'},
  249.         auth=dv.connection.auth,
  250.         )
  251.     if resp.status_code != 201:
  252.       raise RuntimeError('Failed to add newly created dataset to Dataverse.')
  253.     # Parse the content ID from the result.
  254.     content_id = dataverse.utils.get_element(resp.content, 'id')
  255.     doi = content_id.text.split('study/')[-1]
  256.     # Publish the newly-uploaded Dataset so we can find its entity ID later.
  257.     edit_uri = dataverse.utils.get_element(resp.content,
  258.         tag='link',
  259.         attribute='rel',
  260.         attribute_value='edit').get('href')
  261.     resp = requests.post(
  262.         edit_uri,
  263.         headers={'In-Progress': 'false', 'Content-Length': 0},
  264.         auth=dv.connection.auth,
  265.         )
  266.     if resp.status_code != 200:
  267.       print 'ERROR: response %s: %s' % (resp.status_code, resp.content)
  268.       raise RuntimeError('The Dataset could not be published.')
  269.     dataset = dv.get_dataset_by_doi(doi)
  270.     dataset._id = self.get_entity_id(doi)
  271.     return dataset
  272.  
  273.  
if __name__ == '__main__':
  # Usage: python dataversehelper.py <api_key> <doi>
  # The triple-quoted string blocks below are disabled alternative modes;
  # to use one, move its contents out of the string literal.
  api_key = sys.argv[1]

  '''
 # This block for test server.
 from datetime import datetime
 test_api_key = 'f4980a73-7baa-4713-92d5-dfb53ff35002'
 dvhelper = DataverseHelper(test_api_key, None, True)
 if dvhelper.get_doi_from_search('notreal'):
   raise RuntimeError('Problem!')
 test_description = 'This is a description '+str(datetime.utcnow())
 print test_description
 dataset = dvhelper.create_and_publish_new_study('new title', test_description)
 print dataset
 print dataset.doi
 print dataset._id
 test_metadata = json.loads(
     open('testdata/jsonformatter_testdata.json').read())
 #dvhelper.update_study_metadata(doi, test_metadata)
 dataset.update_metadata(test_metadata)
 dataset.publish()
 sys.exit(0)
 '''


  # Persist the DOI -> entity ID cache next to the script so repeated
  # runs skip redundant search queries.
  cache_file = 'dataverse_entity_ids.json'
  dvhelper = DataverseHelper(api_key, cache_file)

  '''
 # This block for creating an empty study.
 doi = dvhelper.create_blank_study()
 print 'Created study'
 print 'Created: %s' % doi
 print dvhelper.get_study_metadata(doi)
 sys.exit(0)
 '''


  '''
 # Read DOIs from a file.
 doi_file = sys.argv[2]
 with open(doi_file) as fid:
   dois = filter(None, [x.strip() for x in fid.readlines()])
 '''

  # Pass one DOI on the command line.
  dois = [sys.argv[2]]

  '''
 # Run a query from the command line to find a DOI.
 query = sys.argv[2]
 doi_result = dvhelper.get_doi_from_search(query)
 print '%s -> %s' % (query, doi_result)
 if not doi_result: raise RuntimeError('Failed to find DOI')
 dois = [doi_result]
 '''

  # For each DOI: resolve its entity ID (warming the cache) and dump the
  # dataset's full JSON metadata to stdout.
  ctr = 0
  for doi in dois:
    ctr += 1
    print '%d/%d' % (ctr, len(dois))
    res = dvhelper.get_entity_id(doi)
    print '%s -> %s' % (doi, res)
    print json.dumps(dvhelper.get_study_metadata(doi), indent=2)
    #print json.dumps(dvhelper.get_study_header(doi), indent=2)
    #header = dvhelper.get_study_header(doi)
    #print header['name']
    #print header['url']
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement