Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import spacy.lang.en
- nlp = spacy.lang.en.English()
- text = 'I am trying to extract January as efficient as possible. But what is the best solution?'
- import spacy.tokens
- NORM_EXCEPTIONS = {
- 'jan': 'MONTH', 'january': 'MONTH'
- }
- spacy.tokens.Token.set_extension('norm', getter=lambda t: NORM_EXCEPTIONS.get(t.text.lower(), t.norm_))
- def time_this():
- doc = nlp(text)
- assert [t for t in doc if t._.norm == 'MONTH'] == [doc[5]]
- %timeit time_this()
- import spacy.pipeline
- ruler = spacy.pipeline.EntityRuler(nlp)
- ruler.phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
- ruler.add_patterns([{'label': 'MONTH', 'pattern': 'jan'}, {'label': 'MONTH', 'pattern': 'january'}])
- nlp.add_pipe(ruler)
- def time_this():
- doc = nlp(text)
- assert [t for t in doc.ents] == [doc[5:6]]
- %timeit time_this()
- import spacy.pipeline
- ruler = spacy.pipeline.EntityRuler(nlp)
- ruler.add_patterns([{'label': 'MONTH', 'pattern': [{'lower': {'IN': ['jan', 'january']}}]}])
- nlp.add_pipe(ruler)
- def time_this():
- doc = nlp(text)
- assert [t for t in doc.ents] == [doc[5:6]]
- %timeit time_this()
- import spacy.matcher
- phrase_matcher = spacy.matcher.PhraseMatcher(nlp.vocab, attr="LOWER")
- phrase_matcher.add('MONTH', None, nlp('jan'), nlp('january'))
- def time_this():
- doc = nlp(text)
- matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], phrase_matcher(doc))]
- assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
- %timeit time_this()
- import spacy.matcher
- matcher = spacy.matcher.Matcher(nlp.vocab)
- matcher.add('MONTH', None, [{'lower': {'IN': ['jan', 'january']}}])
- def time_this():
- doc = nlp(text)
- matches = [m for m in filter(lambda x: x[0] == doc.vocab.strings['MONTH'], matcher(doc))]
- assert [doc[m[1]:m[2]] for m in matches] == [doc[5:6]]
- %timeit time_this()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement