Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Tokenize each job description into lowercase word tokens; each element of
# `tok` is the token list for one description in `job_desc` (assumed to be an
# iterable of strings — defined earlier, outside this fragment).
tok = [nltk.word_tokenize(job.lower()) for job in job_desc]

# English stop words. Held in a set (not the list returned by
# stopwords.words) so the membership tests done while filtering tokens
# are O(1) instead of a linear scan per token.
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
def clean_token(what_to_clean, stop_words=None):
    """Flatten a list of token lists, dropping stop words and short tokens.

    Parameters
    ----------
    what_to_clean : iterable of iterables of str
        Typically the per-document token lists produced by word_tokenize.
    stop_words : iterable of str, optional
        Words to exclude. Defaults to the module-level ``stop`` collection.

    Returns
    -------
    list of str
        Every token longer than 2 characters that is not a stop word,
        preserving document order.
    """
    # Materialize the stop words as a set ONCE, outside the token loop:
    # membership tests become O(1) instead of a linear scan per token.
    stop_set = set(stop if stop_words is None else stop_words)
    return [
        token
        for token_list in what_to_clean
        for token in token_list
        if len(token) > 2 and token not in stop_set
    ]
# Count how often each cleaned token appears across all job descriptions,
# then take the 100 most frequent as (word, count) pairs, highest count first.
freq = nltk.FreqDist(clean_token(tok))
most_freq_words = freq.most_common(100)
- [('data', 211),
- ('experience', 78),
- ('learning', 70),
- ('business', 65),
- ('team', 53),
- ('science', 51),
- ('machine', 48),.....
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement