# Untitled

import re

# Patterns are raw strings so that '\s', '\(' and '\)' reach the regex engine
# untouched instead of being read as (invalid) string escape sequences.

# A run of two or more whitespace characters.
_RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
# An affiliation label (two or more capital letters) followed by '('. The
# label is captured in group 1.
_RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
# The ')' closing an affiliation, an optional ',' or ';', one space, and then
# the label and '(' of the next affiliation.
_RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
# An <EMAIL>...</EMAIL> element with an optional ';' and an optional whitespace
# character around it. The email itself is captured in group 1.
_RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')

def clean_affiliations_string(affiliations_string):
    """
    Returns the string without leading and trailing whitespace, and with every
    run of two or more whitespace characters replaced by a single space.
    """
    trimmed = affiliations_string.strip()
    return _RE_MULTIPLE_SPACES.sub(' ', trimmed)

def get_affiliations(affiliations_string):
    """
    Parses a string of labelled affiliations such as 'AA(aff1), AB(aff2)'.

    Returns a tuple of two dictionaries, both keyed by the integer index of
    the affiliation label:
    * the affiliation texts (with any email element removed).
    * the emails, only for the affiliations that contain an <EMAIL> element.

    Raises an Exception if two affiliations resolve to the same index.
    """
    affiliations, emails = {}, {}

    # Normalize the whitespace once, then consume the string one affiliation
    # at a time until nothing is left.
    remaining = clean_affiliations_string(affiliations_string)

    while remaining:
        remaining, index, affiliation = _extract_first_affiliation(remaining)

        if index in affiliations:
            raise Exception('Double label.')

        if '<EMAIL>' in affiliation:
            affiliation, email = _extract_email_from_affiliation(affiliation)
            emails[index] = email

        affiliations[index] = affiliation

    return (affiliations, emails)

def _extract_email_from_affiliation(affiliation):
    """
    Splits the email element out of an affiliation.

    Returns a tuple:
    * the affiliation with every occurrence of the matched email element
      removed.
    * the email found in the first <EMAIL>...</EMAIL> element.

    Raises an Exception if no email element can be matched.
    """
    found = _RE_EMAIL.search(affiliation)
    if found is None:
        raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)

    # group(0) is the whole element including the optional surrounding ';'
    # and whitespace; group(1) is the bare email.
    email_element = found.group(0)
    return (affiliation.replace(email_element, ''), found.group(1))

def _extract_first_affiliation(affiliations_string):
    """
    Extracts the first affiliation from the affiliations string and returns a
    tuple of:
    * the affiliations string without the first affiliation.
    * the index of the first affiliation.
    * the first affiliation.

    Raises an Exception if:
    * the string does not start with an affiliation label followed by '('.
    * the parenthesis opened by the label is never closed.
    * what follows the closing parenthesis is neither the end of the string
      nor the start of another affiliation.
    """
    match = _RE_AFFILIATION_PREFIX.match(affiliations_string)

    if match is None:
        raise Exception('Prefix not found: %s' % affiliations_string)

    index = get_index_from_label(match.group(1))

    # Drop the 'LABEL(' prefix. The match is anchored at the start of the
    # string, so everything after match.end() is the remainder.
    rest = affiliations_string[match.end():]

    # The affiliation text may itself contain parentheses, so we track the
    # nesting depth (the prefix already opened one) and stop at the ')' that
    # brings it back to zero.
    depth = 1
    closing_position = None
    for position, char in enumerate(rest):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                closing_position = position
                break

    if closing_position is None:
        raise Exception('Problem of affiliation with unbalanced parenthesis.')

    # Split at the closing parenthesis: the affiliation is what precedes it,
    # the remainder still starts with that ')'.
    affiliation = rest[:closing_position].strip()
    rest = rest[closing_position:].strip()

    if rest == ')':
        # This was the last affiliation.
        rest = ''
    elif _RE_AFFILIATION_SUFFIX.match(rest) is not None:
        # Another affiliation follows: remove the ')', the optional separator
        # and the space so that the remainder starts with the next label.
        rest = re.sub(r'\)[;,]? ', '', rest, count=1)
    else:
        # The parenthesis was closed but is followed by something that is not
        # the start of another affiliation.
        raise Exception('Unexpected text after affiliation: %s' % rest)

    return (rest, index, affiliation)

# Dictionary used to cache the results of the computation for the labels.
# Maps a label to its index.
_LABEL_INDEX = {}

def get_index_from_label(label):
    """
    Returns an integer index for an affiliation label, ie:
        AA -> 1
        AB -> 2
        BA -> 27
        AAA -> 677

    The label is read as a base-26 number in which 'A' is worth 1, and the
    result is then offset by 26 so that 'AA' maps to 1. The result is stored
    in _LABEL_INDEX so that a label is only computed once.

    Labels are presumably made of the capital letters A-Z only (that is what
    the affiliation prefix pattern matches); other characters are not checked.
    """
    index = _LABEL_INDEX.get(label)
    if index is None:
        # Accumulate the digits from the most significant one: each step
        # shifts the running value by one base-26 position.
        index = 0
        for char in label:
            index = index * 26 + (ord(char) - 64)

        # Because we consider 'A' as 1 and not 0, we need to offset by 26.
        index -= 26

        # Remember the result; without this the cache is never populated.
        _LABEL_INDEX[label] = index

    return index

# Each entry is (input string, expected result of get_affiliations).
TESTS = [
    # A single affiliation carrying the first label.
    ('AA(aff1)', ({1: 'aff1'}, {})),
    # A single affiliation whose label is not the first one.
    ('AB(aff1)', ({2: 'aff1'}, {})),
    # Two affiliations separated by ', ', '; ' or a single space.
    ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    # Three affiliations.
    (
        'AA(aff1), AB(aff2), AC(aff3)',
        ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
    ),
    # Spaces inside the affiliation.
    ('AA(aff with space)', ({1: 'aff with space'}, {})),
    # Commas and nested parentheses inside the affiliation, labels in order
    # and out of order.
    (
        'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
        ({2: 'CfA (Cambridge) USA', 1: 'CERN, Switzerland'}, {}),
    ),
    (
        'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
        ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
    ),
    # An email element inside an affiliation.
    (
        'AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB(CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    # Same, with extra spaces to collapse.
    (
        'AA(CERN,  Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB( CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    # Padding inside the parentheses and inside the email element, plus a
    # trailing ';' after the email element.
    (
        'AA(  CERN  Geneva  ), AB( Another affiliation    <EMAIL>me@me.com  </EMAIL>;)',
        (
            {1: 'CERN Geneva', 2: 'Another affiliation'},
            {2: 'me@me.com'},
        ),
    ),
    # Labels of three and four letters.
    (
        'AA(aff1), AAA(aff2), AAAA(aff3)',
        ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
    ),
]

def test_get_affiliations():
    """
    Runs get_affiliations on every input of TESTS and prints the cases whose
    result differs from the expected output, followed by a summary line.

    An exception raised by get_affiliations is not caught and stops the run.
    """
    for aff_string, output in TESTS:
        if get_affiliations(aff_string) != output:
            # A single parenthesized argument prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
    print('All %d tests finished.' % len(TESTS))