Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import nltk

# Text of the 'preferred_qual' field for the row at position 750 of `df`
# (presumably a pandas DataFrame defined elsewhere -- TODO confirm).
test_string = df.iloc[750]['preferred_qual']

raw_tokens = nltk.tokenize.word_tokenize(test_string)

# Split tokens such as "C/C++" into their slash-separated parts.
# A new list is built instead of appending to / removing from the list being
# iterated: mutating it mid-loop skips the element after each removal.
# Parts are inserted where the original token stood so that neighbouring
# words stay adjacent for any later multi-word matching, and empty pieces
# (from a lone "/" or a leading/trailing slash) are dropped.
tokens = []
for token in raw_tokens:
    if '/' in token:
        tokens.extend(part for part in token.split('/') if part)
    else:
        tokens.append(token)
# Multi-word expressions that should be merged into a single token.
# Matching is done on exact token sequences, so case and singular/plural
# variants are listed separately.
special_words = [
    ('Objective', 'C'),
    ('Computer', 'Science'),
    ('work', 'experience'),
    ('hands-on', 'experience'),
    ('relevant', 'experience'),
    ('practical', 'experience'),
    ('Electrical', 'Engineering'),
    ('web', 'application'),
    ('large', 'software', 'systems'),
    ('next', 'generation'),
    ('back', 'end'),
    ('front', 'end'),
    ('user', 'interface'),
    ('software', 'development'),
    ('communication', 'skills'),
    ('open', 'source'),
    ('general', 'purpose'),
    ('coding', 'languages'),
    ('coding', 'language'),
    ('programming', 'languages'),
    ('programming', 'language'),
    ('embedded', 'systems'),
    ('embedded', 'system'),
    ('device', 'drivers'),
    ('device', 'driver'),
    # NOTE(review): tokens containing '/' are split before this step, so
    # this entry is unlikely to ever match -- confirm intent.
    ('hardware/software', 'integration'),
    ('image', 'processing'),
    ('Machine', 'Learning'),
    ('machine', 'learning'),
    ('deep', 'learning'),
    ('computer', 'vision'),
    ('Customer', 'Relationship', 'Management'),
    ('CRM', 'system'),
    ('project', 'management'),
    ('Big', 'Data'),
    ('presentation', 'skills'),
    ('data', 'patterns'),
    ('business', 'decisions'),
    ('large-scale', 'projects'),
    ('large-scale', 'project'),
    ('documentation', 'skills'),
    ('work', 'collaboratively'),
    ('Finance', 'systems'),
    ('lead', 'discussions'),
    ('key', 'decisions'),
    ('management', 'skills'),
    ('3D', 'rendering'),
    ('GPU', 'optimization'),
    ('rendering', 'engines'),
    ('computational', 'geometry'),
    ('Artificial', 'Intelligence'),
    ('Natural', 'Language'),
]

# Register every expression up front; merged tokens are joined with a space
# (e.g. 'machine', 'learning' -> 'machine learning').
tokenizer = nltk.tokenize.MWETokenizer(special_words, separator=' ')
tokens = tokenizer.tokenize(tokens)
# Tokens that consist solely of one of these punctuation marks are discarded.
puncs = ['.', ',', '!', '?', '&', '*', '(', ')']

# Remove punctuations.
# Filter into a new list rather than calling tokens.remove() inside a loop
# over tokens: removing during iteration skips the following element, which
# left some punctuation behind (e.g. two consecutive ',' tokens).
tokens = [token for token in tokens if token not in puncs]

# Bare expression: displays the result when run as a notebook cell.
tokens
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement