Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# NOTE: all patterns are raw strings so that the regex escapes (\s, \(, \))
# are not interpreted as (invalid) string escape sequences.

# Two or more consecutive whitespace characters.
_RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
# Label opening an affiliation, e.g. the "AA(" of "AA(CERN)". Group 1 is the
# label itself (at least two uppercase letters).
_RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
# Closing parenthesis of one affiliation followed by the label of the next
# one, e.g. "), AB(" or ") AB(".
_RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
# Email wrapped in <EMAIL>...</EMAIL>, together with an optional ';' and one
# optional whitespace character around the tags. Group 1 is the address.
_RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')
def clean_affiliations_string(affiliations_string):
    """
    Trims the surrounding whitespace, then replaces every run of two or more
    whitespace characters with a single space.
    """
    trimmed = affiliations_string.strip()
    collapsed = _RE_MULTIPLE_SPACES.sub(' ', trimmed)
    return collapsed
def get_affiliations(affiliations_string):
    """
    Parses a string of labelled affiliations.

    Returns a tuple of two dictionaries, both keyed by the integer index
    computed from each affiliation label:
        * the affiliations (with any email markup removed).
        * the emails, only for the affiliations that contained one.

    Raises an Exception when two affiliations resolve to the same index.
    """
    by_index = {}
    email_by_index = {}
    # Normalise the whitespace before parsing anything.
    remaining = clean_affiliations_string(affiliations_string)
    # Consume one affiliation per iteration until nothing is left.
    while remaining:
        remaining, index, text = _extract_first_affiliation(remaining)
        if index in by_index:
            raise Exception('Double label.')
        if '<EMAIL>' in text:
            text, address = _extract_email_from_affiliation(text)
            email_by_index[index] = address
        by_index[index] = text
    return (by_index, email_by_index)
def _extract_email_from_affiliation(affiliation):
    """
    Separates the <EMAIL>-tagged address from an affiliation.

    Returns a tuple:
        * the affiliation with the email markup removed.
        * the email.

    Raises an Exception when no <EMAIL>...</EMAIL> element can be matched.
    """
    found = _RE_EMAIL.search(affiliation)
    if found is None:
        raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)
    # Remove the whole matched markup (including any adjacent ';' or space the
    # pattern consumed) wherever that exact text occurs.
    without_email = affiliation.replace(found.group(0), '')
    return (without_email, found.group(1))
def _extract_first_affiliation(affiliations_string):
    """
    Extracts the first affiliation from the affiliations string and returns a
    tuple of:
        * the affiliations string without the first affiliation.
        * the index of the first affiliation.
        * the first affiliation.

    Raises an Exception when the string does not start with a label prefix,
    when the parentheses of the first affiliation never balance, or when the
    text following the affiliation is neither the end of the string nor the
    start of another affiliation.
    """
    match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
    if match is None:
        raise Exception('Prefix not found: %s' % affiliations_string)
    index = get_index_from_label(match.group(1))
    # Drop the "LABEL(" prefix; the match is anchored at the start of the
    # string so everything after match.end() is the remainder.
    affiliations_string = affiliations_string[match.end():]

    # The prefix already opened one parenthesis. Walk the string counting
    # parentheses; the affiliation ends at the ")" that brings the count back
    # to zero.
    opened_parenthesis = 1
    idx = 0
    for idx, char in enumerate(affiliations_string):
        if char == '(':
            opened_parenthesis += 1
        elif char == ')':
            opened_parenthesis -= 1
            if opened_parenthesis == 0:
                break
    if opened_parenthesis > 0:
        raise Exception('Problem of affiliation with unbalanced parenthesis.')

    # idx now points at the closing parenthesis: the affiliation is what comes
    # before it, the remainder starts with that ")".
    affiliation = affiliations_string[:idx].strip()
    affiliations_string = affiliations_string[idx:].strip()

    # Finally check that the remainder is either the end of the string or the
    # start of another affiliation, and strip the separator.
    if affiliations_string == ')':
        # This was the last affiliation.
        affiliations_string = ''
    elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
        # Another affiliation follows: remove the ")", the optional "," or ";"
        # and the space so the remainder starts at the next label.
        affiliations_string = re.sub(r'\)[;,]? ', '', affiliations_string, count=1)
    else:
        # Something went wrong.
        raise Exception('Problem of affiliation with unbalanced parenthesis.')
    return (affiliations_string, index, affiliation)
# Dictionary used to cache the results of the computation for the labels.
_LABEL_INDEX = {}


def get_index_from_label(label):
    """
    Returns an integer index for an affiliation label, ie:
        AA -> 1
        AB -> 2
        BA -> 27
        AAA -> 677

    The result is stored in _LABEL_INDEX so that each label is only computed
    once.
    """
    index = _LABEL_INDEX.get(label)
    if index is None:
        # The label is a base-26 representation of the index in which 'A' is
        # 1 (ord('A') == 65); the last letter is the least significant digit.
        index = 0
        for power, char in enumerate(reversed(label)):
            index += (ord(char) - 64) * (26 ** power)
        # Because we consider 'A' as 1 and not 0, we need to offset by 26.
        index -= 26
        # Remember the result under the original (non-reversed) label.
        _LABEL_INDEX[label] = index
    return index
# Each entry is (input string, expected (affiliations, emails) result).
TESTS = [
    # Simplest case
    (
        'AA(aff1)',
        ({1: 'aff1'}, {}),
    ),
    # Index does not start at AA.
    (
        'AB(aff1)',
        ({2: 'aff1'}, {}),
    ),
    # 2 affiliations - ordered, with each accepted separator
    (
        'AA(aff1), AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    (
        'AA(aff1); AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    (
        'AA(aff1) AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    # 3 affiliations - ordered
    (
        'AA(aff1), AB(aff2), AC(aff3)',
        ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
    ),
    # Spaces inside the affiliation
    (
        'AA(aff with space)',
        ({1: 'aff with space'}, {}),
    ),
    # Nested parentheses, labels in order and out of order
    (
        'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
        ({2: 'CfA (Cambridge) USA', 1: 'CERN, Switzerland'}, {}),
    ),
    (
        'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
        ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
    ),
    # Emails
    (
        'AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB(CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    (
        'AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB( CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    (
        'AA( CERN Geneva ), AB( Another affiliation <EMAIL>me@me.com </EMAIL>;)',
        (
            {1: 'CERN Geneva', 2: 'Another affiliation'},
            {2: 'me@me.com'},
        ),
    ),
    # Labels longer than two letters
    (
        'AA(aff1), AAA(aff2), AAAA(aff3)',
        ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
    ),
]
def test_get_affiliations():
    """
    Runs get_affiliations() on every (input, expected output) pair of TESTS.

    Prints one report per mismatch (the input and the expected output), then
    a summary line with the number of tests. Returns nothing.
    """
    for aff_string, output in TESTS:
        if get_affiliations(aff_string) != output:
            # Single-argument print(...) behaves the same as a statement
            # (Python 2) and as a function call (Python 3).
            print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
    print('All %d tests finished.' % len(TESTS))
Add Comment
Please, Sign In to add comment