SHARE
TWEET

Untitled

a guest Jun 25th, 2019 55 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import spacy.lang.en    
  2. nlp = spacy.lang.en.English()
  3. text = 'I am trying to extract January as efficient as possible. But what is the best solution?'
  4.      
  5. import spacy.tokens
  6. NORM_EXCEPTIONS = {
  7.     'jan': 'MONTH', 'january': 'MONTH'
  8. }
  9. spacy.tokens.Token.set_extension('norm', getter=lambda t: NORM_EXCEPTIONS.get(t.text.lower(), t.norm_))
  10. def time_this():
  11.     doc = nlp(text)
  12.     assert [t for t in doc if t._.norm == 'MONTH'] == [doc[5]]
  13.  
  14. %timeit time_this()
  15.      
  16. import spacy.pipeline
  17. ruler = spacy.pipeline.EntityRuler(nlp)
  18. ruler.phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
  19. ruler.add_patterns([{'label': 'MONTH', 'pattern': 'jan'}, {'label': 'MONTH', 'pattern': 'january'}])
  20. nlp.add_pipe(ruler)
  21. def time_this():
  22.     doc = nlp(text)
  23.     assert [t for t in doc.ents] == [doc[5:6]]
  24. %timeit time_this()
  25.      
  26. import spacy.pipeline
  27. ruler = spacy.pipeline.EntityRuler(nlp)
  28. ruler.add_patterns([{'label': 'MONTH', 'pattern': [{'lower': {'IN': ['jan', 'january']}}]}])
  29. nlp.add_pipe(ruler)
  30. def time_this():
  31.     doc = nlp(text)
  32.     assert [t for t in doc.ents] == [doc[5:6]]
  33. %timeit time_this()
  34.      
  35. import spacy.matcher
  36. phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
  37. phrase_matcher.add('MONTH', None, nlp('jan'), nlp('january'))
  38. def time_this():
  39.     doc = nlp(text)
  40.     matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], phrase_matcher(doc))]
  41.     assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
  42. %timeit time_this()
  43.      
  44. import spacy.matcher
  45. matcher = spacy.matcher.Matcher(nlp.vocab)
  46. matcher.add('MONTH', None, [{'lower': {'IN': ['jan', 'january']}}])
  47. def time_this():
  48.     doc = nlp(text)
  49.     matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], matcher(doc))]
  50.     assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
  51. %timeit time_this()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top