# Untitled

import spacy.lang.en
# Sample sentence shared by every benchmark in this script; "January" is
# the token at index 5.
text = (
    'I am trying to extract January as efficient as possible. '
    'But what is the best solution?'
)
# Instantiate the English language class directly; no pretrained model
# package is loaded here.
nlp = spacy.lang.en.English()

import spacy.tokens
# Lower-cased surface forms that should be reported with the 'MONTH' norm.
NORM_EXCEPTIONS = {
    'jan': 'MONTH',
    'january': 'MONTH',
}


def _month_aware_norm(token):
    """Return the NORM_EXCEPTIONS entry for the token's lower-cased text.

    Falls back to the token's built-in ``norm_`` when the text is not a
    key of NORM_EXCEPTIONS.
    """
    return NORM_EXCEPTIONS.get(token.text.lower(), token.norm_)


# Expose the lookup as the custom attribute ``token._.norm``.
spacy.tokens.Token.set_extension('norm', getter=_month_aware_norm)
def time_this():
    """Process the sample text and check that only token 5 has norm 'MONTH'.

    Uses the custom ``token._.norm`` extension attribute to find the month.
    """
    parsed = nlp(text)
    month_tokens = []
    for token in parsed:
        if token._.norm == 'MONTH':
            month_tokens.append(token)
    assert month_tokens == [parsed[5]]

# IPython line magic: time the extension-attribute variant of time_this()
# defined just above (the assert runs inside the timed call).
%timeit time_this()

import spacy.pipeline
# Import the matcher module explicitly rather than relying on
# `import spacy.pipeline` having loaded it as a side effect.
import spacy.matcher

# Drop any ruler left in the pipeline by an earlier run: add_pipe() rejects
# a second component registered under the same name.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

ruler = spacy.pipeline.EntityRuler(nlp)
# Replace the ruler's phrase matcher with one keyed on the LOWER attribute
# so the lower-case phrase patterns also match "January" in the text.
ruler.phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
ruler.add_patterns([
    {'label': 'MONTH', 'pattern': 'jan'},
    {'label': 'MONTH', 'pattern': 'january'},
])
nlp.add_pipe(ruler)
def time_this():
    """Process the sample text and check the entities set by the pipeline.

    Exactly one entity is expected: the single-token span covering token 5.
    """
    parsed = nlp(text)
    found = list(parsed.ents)
    assert found == [parsed[5:6]]
# IPython line magic: time the EntityRuler (phrase patterns) variant of
# time_this() defined just above.
%timeit time_this()

import spacy.pipeline
# An 'entity_ruler' component may already be in the pipeline (from the
# phrase-pattern benchmark or a re-run of this cell). add_pipe() raises on a
# duplicate component name, and a leftover ruler would also be included in
# this benchmark's timing, so remove it first.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

ruler = spacy.pipeline.EntityRuler(nlp)
# Single token pattern: one token whose lower-cased text is 'jan' or 'january'.
ruler.add_patterns([
    {'label': 'MONTH', 'pattern': [{'lower': {'IN': ['jan', 'january']}}]},
])
nlp.add_pipe(ruler)
def time_this():
    """Run the pipeline on the sample text and verify its entity spans.

    The only entity expected is the one-token span at index 5.
    """
    analysed = nlp(text)
    expected = [analysed[5:6]]
    entities = list(analysed.ents)
    assert entities == expected
# IPython line magic: time the EntityRuler (token pattern) variant of
# time_this() defined just above.
%timeit time_this()

import spacy.matcher
# This benchmark calls the matcher directly, so take any EntityRuler from
# the earlier benchmarks out of the pipeline. Otherwise nlp(text) inside
# time_this() would also pay for the ruler and skew the comparison.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

# Phrase matcher comparing on the LOWER attribute, so the lower-case
# patterns also match "January" in the text.
phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
phrase_matcher.add('MONTH', None, nlp('jan'), nlp('january'))
def time_this():
    """Tokenize the sample text and check the PhraseMatcher's MONTH matches.

    Exactly one match is expected: the single-token span at index 5.
    """
    doc = nlp(text)
    # Resolve the label's ID once instead of once per match.
    month_id = doc.vocab.strings['MONTH']
    # Each match is a (match_id, start, end) tuple of token offsets.
    spans = [
        doc[start:end]
        for match_id, start, end in phrase_matcher(doc)
        if match_id == month_id
    ]
    assert spans == [doc[5:6]]
# IPython line magic: time the standalone PhraseMatcher variant of
# time_this() defined just above.
%timeit time_this()

import spacy.matcher
# This benchmark calls the matcher directly, so take any EntityRuler from
# the earlier benchmarks out of the pipeline. Otherwise nlp(text) inside
# time_this() would also pay for the ruler and skew the comparison.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

matcher = spacy.matcher.Matcher(nlp.vocab)
# Single token pattern: one token whose lower-cased text is 'jan' or 'january'.
matcher.add('MONTH', None, [{'lower': {'IN': ['jan', 'january']}}])
def time_this():
    """Tokenize the sample text and check the token Matcher's MONTH matches.

    Exactly one match is expected: the single-token span at index 5.
    """
    doc = nlp(text)
    # Resolve the label's ID once instead of once per match.
    month_id = doc.vocab.strings['MONTH']
    # Each match is a (match_id, start, end) tuple of token offsets.
    spans = [
        doc[start:end]
        for match_id, start, end in matcher(doc)
        if match_id == month_id
    ]
    assert spans == [doc[5:6]]
# IPython line magic: time the standalone token Matcher variant of
# time_this() defined just above.
%timeit time_this()