Advertisement
Guest User

Untitled

a guest
Jun 25th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.82 KB | None | 0 0
  1. import spacy.lang.en
  2. nlp = spacy.lang.en.English()
  3. text = 'I am trying to extract January as efficient as possible. But what is the best solution?'
  4.  
  5. import spacy.tokens
  6. NORM_EXCEPTIONS = {
  7. 'jan': 'MONTH', 'january': 'MONTH'
  8. }
  9. spacy.tokens.Token.set_extension('norm', getter=lambda t: NORM_EXCEPTIONS.get(t.text.lower(), t.norm_))
  10. def time_this():
  11. doc = nlp(text)
  12. assert [t for t in doc if t._.norm == 'MONTH'] == [doc[5]]
  13.  
  14. %timeit time_this()
  15.  
  16. import spacy.pipeline
  17. ruler = spacy.pipeline.EntityRuler(nlp)
  18. ruler.phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
  19. ruler.add_patterns([{'label': 'MONTH', 'pattern': 'jan'}, {'label': 'MONTH', 'pattern': 'january'}])
  20. nlp.add_pipe(ruler)
  21. def time_this():
  22. doc = nlp(text)
  23. assert [t for t in doc.ents] == [doc[5:6]]
  24. %timeit time_this()
  25.  
  26. import spacy.pipeline
  27. ruler = spacy.pipeline.EntityRuler(nlp)
  28. ruler.add_patterns([{'label': 'MONTH', 'pattern': [{'lower': {'IN': ['jan', 'january']}}]}])
  29. nlp.add_pipe(ruler)
  30. def time_this():
  31. doc = nlp(text)
  32. assert [t for t in doc.ents] == [doc[5:6]]
  33. %timeit time_this()
  34.  
  35. import spacy.matcher
  36. phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
  37. phrase_matcher.add('MONTH', None, nlp('jan'), nlp('january'))
  38. def time_this():
  39. doc = nlp(text)
  40. matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], phrase_matcher(doc))]
  41. assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
  42. %timeit time_this()
  43.  
  44. import spacy.matcher
  45. matcher = spacy.matcher.Matcher(nlp.vocab)
  46. matcher.add('MONTH', None, [{'lower': {'IN': ['jan', 'january']}}])
  47. def time_this():
  48. doc = nlp(text)
  49. matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], matcher(doc))]
  50. assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
  51. %timeit time_this()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement