Guest User

Untitled

a guest
May 22nd, 2018
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.29 KB | None | 0 0
  1. from copy import copy
  2.  
  3. import spacy
  4. from spacy.tokens import Doc, Span
  5.  
  6.  
  7. class Template(object):
  8. """Create synthetic NER training data from a template document.
  9.  
  10. Provide a template NER-annotated spacy Doc when instantiating the class. Passing text to the `render` method
  11. populates the templated entity spans, preserving entity labels, and generates a new Doc.
  12.  
  13. Attributes
  14. ----------
  15. template : Doc
  16. NER-annotated template document.
  17. ents : list
  18. Entities of the template document, replaced when rendered. A list of 4-tuples like `[(ent.text, ent.start,
  19. end.end, ent.label)]`.
  20.  
  21.  
  22. Methods
  23. -------
  24. update(template)
  25. Replace the existing template.
  26. render(substitutes)
  27. Populate the template by replacing its entity spans with `substitutes`.
  28. __len__()
  29. Count of template entities.
  30. __str__()
  31. Template text.
  32.  
  33. Notes
  34. -----
  35. Any attributes of the template document and its component elements (e.g., tokens) are discarded, aside from IOB
  36. tags and entity labels.
  37. """
  38.  
  39. def __init__(self, template: Doc):
  40. self._blank_nlp = spacy.blank('en')
  41. self.update(template)
  42.  
  43. def __str__(self):
  44. return self.template.text
  45.  
  46. def __len__(self):
  47. return len(self.ents)
  48.  
  49. def update(self, template: Doc):
  50. self.template = template
  51. assert len(template.ents)
  52. self.ents = self._extract_ents(template)
  53.  
  54. def render(self, substitutes: list) -> Doc:
  55. if len(self.ents) > len(substitutes):
  56. raise ValueError('Need at least as many substitute entities as original entities')
  57. substitutes = copy(substitutes)
  58.  
  59. # Build up output text
  60. output_text = []
  61. output_spans = []
  62. output_idx = 0
  63. for token_idx, token in enumerate(self.template):
  64. if token.ent_iob_ == 'B':
  65. # Replace the first token of the entity span with its substitute
  66. output_spans.append((
  67. output_idx,
  68. output_idx + len(substitutes[0]),
  69. token.ent_type_
  70. ))
  71. # Make the substitution and advance the output index to match
  72. output_idx += len(substitutes[0])
  73. output_text.append(substitutes.pop(0))
  74. if token.ent_iob_ in ['B', 'I'] and self.template[
  75. min(token_idx + 1, len(self.template) - 1)].ent_iob_ != 'I':
  76. # This is the last token of the entity span, so add its whitespace
  77. output_text.append(token.whitespace_)
  78. output_idx += len(token.whitespace_)
  79. if token.ent_iob_ in ['', 'O']:
  80. # Pass non-entity tokens through
  81. output_text.append(token.text_with_ws)
  82. output_idx += len(token.text_with_ws)
  83. output_doc = self._blank_nlp(''.join(output_text))
  84. self._add_ents(output_doc, output_spans)
  85. return output_doc
  86.  
  87. @staticmethod
  88. def _add_ents(doc, spans):
  89. for start, end, label in spans:
  90. span = doc.char_span(start, end, label=label)
  91. if span:
  92. doc.ents = list(doc.ents) + [span]
  93. else:
  94. print('Skipping invalid span!')
  95. return doc
  96.  
  97. @staticmethod
  98. def _extract_ents(doc):
  99. return [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
  100.  
  101.  
  102. def test_one_ent():
  103. nlp = spacy.load('en_core_web_sm')
  104. doc = nlp(u'Cambridge is full of rabbits.')
  105. template = Template(doc)
  106. substitutes = ['New York']
  107. output_doc = template.render(substitutes)
  108. assert isinstance(output_doc, Doc)
  109. assert output_doc.text == 'New York is full of rabbits.'
  110. assert output_doc.ents and isinstance(output_doc.ents[0], Span)
  111. assert output_doc.ents[0].text == 'New York'
  112.  
  113.  
  114. def test_two_ents():
  115. nlp = spacy.load('en_core_web_sm')
  116. doc = nlp(u'New York or San Francisco?')
  117. template = Template(doc)
  118. substitutes = ['Boston', 'Philadelphia']
  119. output_doc = template.render(substitutes)
  120. assert isinstance(output_doc, Doc)
  121. assert output_doc.text == 'Boston or Philadelphia?'
  122. assert len(output_doc.ents) == 2
  123. assert output_doc.ents[0].text == 'Boston'
  124. assert output_doc.ents[1].text == 'Philadelphia'
  125.  
  126.  
  127. if __name__ == '__main__':
  128. test_one_ent()
  129. test_two_ents()
Add Comment
Please, Sign In to add comment