Advertisement
Guest User

Untitled

a guest
Jun 26th, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.69 KB | None | 0 0
  1. # each item is a list of tokenized job_description
  2. tok = [nltk.word_tokenize(job.lower()) for job in job_desc]
  3.  
  4. # ignore stop words, bullets, etc. And put it into one list
  5. from nltk.corpus import stopwords
  6. stop = stopwords.words('english')
  7.  
  8. def clean_token(what_to_clean):
  9. cleaned_tok = []
  10. for lists in what_to_clean:
  11. for item in lists:
  12. if len(item)>2 and (item not in stop):
  13. cleaned_tok.append(item)
  14. return cleaned_tok
  15.  
  16. freq = nltk.FreqDist(clean_token(tok))
  17. most_freq_words = freq.most_common(100)
  18.  
  19. [('data', 211),
  20. ('experience', 78),
  21. ('learning', 70),
  22. ('business', 65),
  23. ('team', 53),
  24. ('science', 51),
  25. ('machine', 48), ...]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement