Advertisement
Guest User

Untitled

a guest
Jun 26th, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.69 KB | None | 0 0
  1. # each item is a list of tokenized job_description
  2. tok = [nltk.word_tokenize(job.lower()) for job in job_desc]
  3.  
  4. # ignore stop words, bullets, etc. And put it into one list
  5. from nltk.corpus import stopwords
  6. stop = stopwords.words('english')
  7.  
  8. def clean_token(what_to_clean):
  9. cleaned_tok = []
  10. for lists in what_to_clean:
  11. for item in lists:
  12. if len(item)>2 and (item not in stop):
  13. cleaned_tok.append(item)
  14. return cleaned_tok
  15.  
  16. freq = nltk.FreqDist(clean_token(tok))
  17. most_freq_words = freq.most_common(100)
  18.  
  19. [('data', 211),
  20. ('experience', 78),
  21. ('learning', 70),
  22. ('business', 65),
  23. ('team', 53),
  24. ('science', 51),
  25. ('machine', 48), ...]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement