Need a unique gift idea?
A Pastebin account makes a great Christmas gift
SHARE
TWEET

Untitled

a guest Jul 22nd, 2018 166 Never
Upgrade to PRO!
ENDING IN 00 days 00 hours 00 mins 00 secs
 
  1. import re
  2.  
  3. _RE_MULTIPLE_SPACES = re.compile('\s\s+')
  4. _RE_AFFILIATION_PREFIX = re.compile('([A-Z][A-Z]+)\(')
  5. _RE_AFFILIATION_SUFFIX = re.compile('\)[,;]? [A-Z][A-Z]+\(')
  6. _RE_EMAIL = re.compile(';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')
  7.  
  8. def clean_affiliations_string(affiliations_string):
  9.     """
  10.     Strips the spaces and collapses multiple spaces.
  11.     """
  12.     return _RE_MULTIPLE_SPACES.sub(' ', affiliations_string.strip())
  13.  
  14. def get_affiliations(affiliations_string):
  15.     """
  16.     Returns a dictionary of affiliations and emails.
  17.     """
  18.     affiliations = {}
  19.     emails = {}
  20.  
  21.     # First we clean up the spaces in the affilitions string.
  22.     affiliations_string = clean_affiliations_string(affiliations_string)
  23.  
  24.     while affiliations_string:
  25.         affiliations_string, index, affiliation = _extract_first_affiliation(affiliations_string)
  26.         if index in affiliations:
  27.             raise Exception('Double label.')
  28.         else:
  29.             if '<EMAIL>' in affiliation:
  30.                 affiliation, email = _extract_email_from_affiliation(affiliation)
  31.                 emails[index] = email
  32.             affiliations[index] = affiliation
  33.  
  34.     return (affiliations, emails)
  35.  
  36. def _extract_email_from_affiliation(affiliation):
  37.     """
  38.     Returns a tuple:
  39.     * affiliation without email.
  40.     * email.
  41.     """
  42.     email = None
  43.     match = _RE_EMAIL.search(affiliation)
  44.     if match is None:
  45.         raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)
  46.     else:
  47.         email = match.group(1)
  48.         affiliation = affiliation.replace(match.group(0), '')
  49.  
  50.     return (affiliation, email)
  51.  
  52. def _extract_first_affiliation(affiliations_string):
  53.     """
  54.     Extract the first affiliation from the affiliations string and returns a
  55.     tuple of:
  56.     * the affiliations string without the first affiliation.
  57.     * the index of the first affiliation.
  58.     * the first affiliation.
  59.     """
  60.     match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
  61.  
  62.     if match is None:
  63.         raise Exception('Prefix not found: %s' % affiliations_string)
  64.  
  65.     label = match.group(1)
  66.     index = get_index_from_label(label)
  67.  
  68.     affiliations_string = _RE_AFFILIATION_PREFIX.sub('', affiliations_string, count=1)
  69.     opened_parenthesis = 1
  70.  
  71.     # Now we count the parenthesis and when we find balanced parenthesis, we
  72.     # consider that we got the full affiliation string.
  73.     idx = 0
  74.     for idx, char in enumerate(affiliations_string):
  75.         if char == '(':
  76.             opened_parenthesis += 1
  77.         elif char == ')':
  78.             opened_parenthesis -= 1
  79.  
  80.         if opened_parenthesis == 0:
  81.             break
  82.  
  83.     if opened_parenthesis > 0:
  84.         raise Exception('Problem of affiliation with unbalanced parenthesis.')
  85.  
  86.     # OK. We know where the affiliation is so we extract it and remove it from
  87.     # the global string.
  88.     affiliation = affiliations_string[:idx].strip()
  89.     affiliations_string = affiliations_string[idx:].strip()
  90.  
  91.     # Finally we check that the global string starts with an affiliation suffix
  92.     # and we clean it.
  93.     if affiliations_string == ')':
  94.         # OK. This was the last affiliation.
  95.         affiliations_string = ''
  96.     elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
  97.         # OK. There is an affiliation following.
  98.         affiliations_string = re.sub('\)[;,]? ', '', affiliations_string, count=1)
  99.     else:
  100.         # OK. Something went wrong.
  101.         raise Exception('Problem of affiliation with unbalanced parenthesis.')
  102.  
  103.     return (affiliations_string, index, affiliation)
  104.  
  105. # Dictionary used to cache the results of the computation for the labels.
  106. _LABEL_INDEX = {}
  107.  
  108. def get_index_from_label(label):
  109.     """
  110.     Returns an integer index for an affiliation label, ie:
  111.         AA -> 1
  112.         AB -> 2
  113.         BA -> 27
  114.         AAA -> 677
  115.     """
  116.     index = _LABEL_INDEX.get(label)
  117.     if index is None:
  118.         # First we reverse the label.
  119.         label = label[::-1]
  120.  
  121.         # Then the label is a base-26 representation of the index.
  122.         index = 0
  123.         for idx, char in enumerate(label):
  124.             index += (ord(char) - 64) * (26 ** idx)
  125.  
  126.         # Because we consider 'A' as 1 and not 0, we need to offset by 26.
  127.         index -= 26
  128.  
  129.     return index
  130.  
# Test table read by test_get_affiliations(): each entry is a tuple of
# (affiliations string, expected (affiliations, emails) result).
TESTS = [
        # Simplest case
        ('AA(aff1)', ({1: 'aff1'}, {})),
        # Index does not start at AA.
        ('AB(aff1)', ({2: 'aff1'}, {})),
        # 2 affiliations - ordered
        # (separated by ', ', by '; ' or by a single space)
        ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
        ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
        ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
        # 3 affiliations - ordered
        ('AA(aff1), AB(aff2), AC(aff3)', ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {})),
        # Affiliation containing spaces
        ('AA(aff with space)', ({1: 'aff with space'}, {})),
        # Parenthesis nested inside an affiliation
        ('AA(CERN, Switzerland), AB(CfA (Cambridge) USA)', ({2: 'CfA (Cambridge) USA', 1: 'CERN, Switzerland'}, {})),
        # Same affiliations with the labels in reverse order
        ('AB(CERN, Switzerland), AA(CfA (Cambridge) USA)', ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {})),
        # Email inside an affiliation
        ('AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB(CfA (Cambridge))', ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: 'bthiell@cfa.harvard.edu'})),
        # Email plus extra spaces inside the affiliations
        ('AA(CERN,  Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB( CfA (Cambridge))', ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: 'bthiell@cfa.harvard.edu'})),
        # Spaces padding the affiliations and the email, ';' after the tag
        ('AA(  CERN  Geneva  ), AB( Another affiliation    <EMAIL>me@me.com  </EMAIL>;)', ({1: 'CERN Geneva', 2: 'Another affiliation'}, {2: 'me@me.com'})),
        # Labels of more than 2 letters
        ('AA(aff1), AAA(aff2), AAAA(aff3)', ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {})),
        ]
  150.  
  151. def test_get_affiliations():
  152.     for aff_string, output in TESTS:
  153.         if get_affiliations(aff_string) != output:
  154.             print 'Test failed:\n\t%s\n\t%s' % (aff_string, output)
  155.     print 'All %d tests finished.' % len(TESTS)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top