Guest User

Untitled

a guest
Jul 22nd, 2018
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.75 KB | None | 0 0
  1. import re
  2.  
  3. _RE_MULTIPLE_SPACES = re.compile('\s\s+')
  4. _RE_AFFILIATION_PREFIX = re.compile('([A-Z][A-Z]+)\(')
  5. _RE_AFFILIATION_SUFFIX = re.compile('\)[,;]? [A-Z][A-Z]+\(')
  6. _RE_EMAIL = re.compile(';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')
  7.  
  8. def clean_affiliations_string(affiliations_string):
  9. """
  10. Strips the spaces and collapses multiple spaces.
  11. """
  12. return _RE_MULTIPLE_SPACES.sub(' ', affiliations_string.strip())
  13.  
  14. def get_affiliations(affiliations_string):
  15. """
  16. Returns a dictionary of affiliations and emails.
  17. """
  18. affiliations = {}
  19. emails = {}
  20.  
  21. # First we clean up the spaces in the affilitions string.
  22. affiliations_string = clean_affiliations_string(affiliations_string)
  23.  
  24. while affiliations_string:
  25. affiliations_string, index, affiliation = _extract_first_affiliation(affiliations_string)
  26. if index in affiliations:
  27. raise Exception('Double label.')
  28. else:
  29. if '<EMAIL>' in affiliation:
  30. affiliation, email = _extract_email_from_affiliation(affiliation)
  31. emails[index] = email
  32. affiliations[index] = affiliation
  33.  
  34. return (affiliations, emails)
  35.  
  36. def _extract_email_from_affiliation(affiliation):
  37. """
  38. Returns a tuple:
  39. * affiliation without email.
  40. * email.
  41. """
  42. email = None
  43. match = _RE_EMAIL.search(affiliation)
  44. if match is None:
  45. raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)
  46. else:
  47. email = match.group(1)
  48. affiliation = affiliation.replace(match.group(0), '')
  49.  
  50. return (affiliation, email)
  51.  
  52. def _extract_first_affiliation(affiliations_string):
  53. """
  54. Extract the first affiliation from the affiliations string and returns a
  55. tuple of:
  56. * the affiliations string without the first affiliation.
  57. * the index of the first affiliation.
  58. * the first affiliation.
  59. """
  60. match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
  61.  
  62. if match is None:
  63. raise Exception('Prefix not found: %s' % affiliations_string)
  64.  
  65. label = match.group(1)
  66. index = get_index_from_label(label)
  67.  
  68. affiliations_string = _RE_AFFILIATION_PREFIX.sub('', affiliations_string, count=1)
  69. opened_parenthesis = 1
  70.  
  71. # Now we count the parenthesis and when we find balanced parenthesis, we
  72. # consider that we got the full affiliation string.
  73. idx = 0
  74. for idx, char in enumerate(affiliations_string):
  75. if char == '(':
  76. opened_parenthesis += 1
  77. elif char == ')':
  78. opened_parenthesis -= 1
  79.  
  80. if opened_parenthesis == 0:
  81. break
  82.  
  83. if opened_parenthesis > 0:
  84. raise Exception('Problem of affiliation with unbalanced parenthesis.')
  85.  
  86. # OK. We know where the affiliation is so we extract it and remove it from
  87. # the global string.
  88. affiliation = affiliations_string[:idx].strip()
  89. affiliations_string = affiliations_string[idx:].strip()
  90.  
  91. # Finally we check that the global string starts with an affiliation suffix
  92. # and we clean it.
  93. if affiliations_string == ')':
  94. # OK. This was the last affiliation.
  95. affiliations_string = ''
  96. elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
  97. # OK. There is an affiliation following.
  98. affiliations_string = re.sub('\)[;,]? ', '', affiliations_string, count=1)
  99. else:
  100. # OK. Something went wrong.
  101. raise Exception('Problem of affiliation with unbalanced parenthesis.')
  102.  
  103. return (affiliations_string, index, affiliation)
  104.  
  105. # Dictionary used to cache the results of the computation for the labels.
  106. _LABEL_INDEX = {}
  107.  
  108. def get_index_from_label(label):
  109. """
  110. Returns an integer index for an affiliation label, ie:
  111. AA -> 1
  112. AB -> 2
  113. BA -> 27
  114. AAA -> 677
  115. """
  116. index = _LABEL_INDEX.get(label)
  117. if index is None:
  118. # First we reverse the label.
  119. label = label[::-1]
  120.  
  121. # Then the label is a base-26 representation of the index.
  122. index = 0
  123. for idx, char in enumerate(label):
  124. index += (ord(char) - 64) * (26 ** idx)
  125.  
  126. # Because we consider 'A' as 1 and not 0, we need to offset by 26.
  127. index -= 26
  128.  
  129. return index
  130.  
  131. TESTS = [
  132. # Simplest case
  133. ('AA(aff1)', ({1: 'aff1'}, {})),
  134. # Index does not start at AA.
  135. ('AB(aff1)', ({2: 'aff1'}, {})),
  136. # 2 affiliations - ordered
  137. ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
  138. ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
  139. ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
  140. # 3 affiliations - ordered
  141. ('AA(aff1), AB(aff2), AC(aff3)', ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {})),
  142. ('AA(aff with space)', ({1: 'aff with space'}, {})),
  143. ('AA(CERN, Switzerland), AB(CfA (Cambridge) USA)', ({2: 'CfA (Cambridge) USA', 1: 'CERN, Switzerland'}, {})),
  144. ('AB(CERN, Switzerland), AA(CfA (Cambridge) USA)', ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {})),
  145. ('AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB(CfA (Cambridge))', ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: 'bthiell@cfa.harvard.edu'})),
  146. ('AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB( CfA (Cambridge))', ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: 'bthiell@cfa.harvard.edu'})),
  147. ('AA( CERN Geneva ), AB( Another affiliation <EMAIL>me@me.com </EMAIL>;)', ({1: 'CERN Geneva', 2: 'Another affiliation'}, {2: 'me@me.com'})),
  148. ('AA(aff1), AAA(aff2), AAAA(aff3)', ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {})),
  149. ]
  150.  
  151. def test_get_affiliations():
  152. for aff_string, output in TESTS:
  153. if get_affiliations(aff_string) != output:
  154. print 'Test failed:\n\t%s\n\t%s' % (aff_string, output)
  155. print 'All %d tests finished.' % len(TESTS)
Add Comment
Please, Sign In to add comment