daily pastebin goal
40%
SHARE
TWEET

Untitled

a guest Jan 23rd, 2019 85 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import nltk
  2.  
  3. test_string = df.iloc[750]['preferred_qual']
  4.  
  5. tokens = nltk.tokenize.word_tokenize(test_string)
  6.  
  7. for token in tokens:
  8.     if '/' in token:
  9.         tokens += token.split('/')
  10.         tokens.remove(token)
  11.  
  12. tokenizer = nltk.tokenize.MWETokenizer(separator=' ')
  13.  
  14. special_words = [
  15.     ('Objective', 'C'),
  16.     ('Computer', 'Science'),
  17.     ('work', 'experience'),
  18.     ('hands-on', 'experience'),
  19.     ('relevant', 'experience'),
  20.     ('practical', 'experience'),
  21.     ('Electrical', 'Engineering'),
  22.     ('web', 'application'),
  23.     ('large', 'software', 'systems'),
  24.     ('next', 'generation'),
  25.     ('back', 'end'),
  26.     ('front', 'end'),
  27.     ('user', 'interface'),
  28.     ('software', 'development'),
  29.     ('communication', 'skills'),
  30.     ('open', 'source'),
  31.     ('general', 'purpose'),
  32.     ('coding', 'languages'),
  33.     ('coding', 'language'),
  34.     ('programming', 'languages'),
  35.     ('programming', 'language'),
  36.     ('embedded', 'systems'),
  37.     ('embedded', 'system'),
  38.     ('device', 'drivers'),
  39.     ('device', 'driver'),
  40.     ('hardware/software', 'integration'),
  41.     ('image', 'processing'),
  42.     ('Machine', 'Learning'),
  43.     ('machine', 'learning'),
  44.     ('deep', 'learning'),
  45.     ('computer', 'vision'),
  46.     ('Customer', 'Relationship', 'Management'),
  47.     ('CRM', 'system'),
  48.     ('project', 'management'),
  49.     ('Big', 'Data'),
  50.     ('presentation', 'skills'),
  51.     ('data', 'patterns'),
  52.     ('business', 'decisions'),
  53.     ('large-scale', 'projects'),
  54.     ('large-scale', 'project'),
  55.     ('documentation', 'skills'),
  56.     ('work', 'collaboratively'),
  57.     ('Finance', 'systems'),
  58.     ('lead', 'discussions'),
  59.     ('key', 'decisions'),
  60.     ('management', 'skills'),
  61.     ('3D', 'rendering'),
  62.     ('GPU', 'optimization'),
  63.     ('rendering', 'engines'),
  64.     ('computational', 'geometry'),
  65.     ('Artificial', 'Intelligence'),
  66.     ('Natural', 'Language'),
  67. ]
  68.  
  69. for pair in special_words:
  70.     tokenizer.add_mwe(pair)
  71.  
  72. tokens = tokenizer.tokenize(tokens)
  73.  
  74. puncs = ['.', ',', '!', '?', '&', '*', '(', ')']
  75.  
  76. # Remove punctuations.
  77. for punc in puncs:
  78.     for token in tokens:
  79.         if punc == token:
  80.             tokens.remove(token)
  81.        
  82.        
  83. tokens
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top