Advertisement
Guest User

Untitled

a guest
Jan 23rd, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.17 KB | None | 0 0
  # Tokenize one "preferred_qual" text into single words and multi-word terms.
import nltk


def _split_on_slash(words):
    """Return *words* with every 'a/b' token replaced, in place, by 'a', 'b'.

    A new list is built instead of editing *words* during iteration (which
    silently skips elements).  The pieces stay at the position of the token
    they came from, so neighbouring words remain adjacent for the multi-word
    expression pass that follows.
    """
    pieces = []
    for word in words:
        if '/' in word:
            # Empty fragments (a bare '/', or a leading/trailing slash)
            # carry no information, so they are dropped.
            pieces.extend(part for part in word.split('/') if part)
        else:
            pieces.append(word)
    return pieces


def _drop_punctuation(words, marks):
    """Return *words* without any token that is exactly one of *marks*."""
    unwanted = frozenset(marks)
    return [word for word in words if word not in unwanted]


# `df` is not defined in this snippet; presumably a pandas DataFrame created
# earlier in the session with a 'preferred_qual' text column -- TODO confirm.
test_string = df.iloc[750]['preferred_qual']

tokens = nltk.tokenize.word_tokenize(test_string)
tokens = _split_on_slash(tokens)

# Word sequences that should be kept together as a single token.
special_words = [
    ('Objective', 'C'),
    ('Computer', 'Science'),
    ('work', 'experience'),
    ('hands-on', 'experience'),
    ('relevant', 'experience'),
    ('practical', 'experience'),
    ('Electrical', 'Engineering'),
    ('web', 'application'),
    ('large', 'software', 'systems'),
    ('next', 'generation'),
    ('back', 'end'),
    ('front', 'end'),
    ('user', 'interface'),
    ('software', 'development'),
    ('communication', 'skills'),
    ('open', 'source'),
    ('general', 'purpose'),
    ('coding', 'languages'),
    ('coding', 'language'),
    ('programming', 'languages'),
    ('programming', 'language'),
    ('embedded', 'systems'),
    ('embedded', 'system'),
    ('device', 'drivers'),
    ('device', 'driver'),
    # NOTE(review): tokens containing '/' are split before this pass runs,
    # so this entry can no longer match as written -- confirm the intent.
    ('hardware/software', 'integration'),
    ('image', 'processing'),
    ('Machine', 'Learning'),
    ('machine', 'learning'),
    ('deep', 'learning'),
    ('computer', 'vision'),
    ('Customer', 'Relationship', 'Management'),
    ('CRM', 'system'),
    ('project', 'management'),
    ('Big', 'Data'),
    ('presentation', 'skills'),
    ('data', 'patterns'),
    ('business', 'decisions'),
    ('large-scale', 'projects'),
    ('large-scale', 'project'),
    ('documentation', 'skills'),
    ('work', 'collaboratively'),
    ('Finance', 'systems'),
    ('lead', 'discussions'),
    ('key', 'decisions'),
    ('management', 'skills'),
    ('3D', 'rendering'),
    ('GPU', 'optimization'),
    ('rendering', 'engines'),
    ('computational', 'geometry'),
    ('Artificial', 'Intelligence'),
    ('Natural', 'Language'),
]

# Merged expressions are joined with a space, e.g. 'machine learning'.
tokenizer = nltk.tokenize.MWETokenizer(special_words, separator=' ')
tokens = tokenizer.tokenize(tokens)

puncs = ['.', ',', '!', '?', '&', '*', '(', ')']

# Remove punctuation tokens (only tokens that are exactly one of `puncs`).
tokens = _drop_punctuation(tokens, puncs)

# Bare expression: displays the result when run as a notebook/REPL cell.
tokens
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement