Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from nltk.stem import WordNetLemmatizer
- wordnet_lemma = WordNetLemmatizer()
- from collections import Counter
# Split each raw loan-purpose string into a list of word tokens.
credit_scoring['tokenized_words'] = credit_scoring['purpose'].map(nltk.word_tokenize)
def lemmatize_nouns(tokens):
    """Return the noun-form WordNet lemma of every token in *tokens*."""
    return [wordnet_lemma.lemmatize(tok, pos='n') for tok in tokens]
def lemmatize_verbs(tokens):
    """Return the verb-form WordNet lemma of every token in *tokens*."""
    return [wordnet_lemma.lemmatize(tok, pos='v') for tok in tokens]
# BUG FIX: the original called
#     .apply(lemmatize_nouns, lemmatize_verbs)
# Series.apply takes only ONE function; the second positional argument was
# swallowed as apply's `convert_dtype` flag, so verbs were never lemmatized.
# Compose the two passes explicitly: nouns first, then verbs, matching the
# original author's evident intent.
credit_scoring['lemmas'] = credit_scoring['tokenized_words'].apply(
    lambda tokens: lemmatize_verbs(lemmatize_nouns(tokens))
)
def to_cut_purpose(lemmas):
    """Map a list of lemmas to one of four coarse loan-purpose labels.

    Categories are checked in priority order; the first one whose keyword
    set intersects *lemmas* wins. Anything that matches nothing is
    'unclassified'.
    """
    categories = (
        ('real estate', {'house', 'estate', 'property', 'housing'}),
        ('wedding', {'wedding'}),
        ('education', {'education', 'university', 'educated'}),
        ('car', {'car'}),
    )
    for label, keywords in categories:
        if any(word in lemmas for word in keywords):
            return label
    return 'unclassified'
# Assign the coarse category, report the category frequencies, and show the
# rows that resisted classification (candidates for new keywords).
credit_scoring['ultimate_purpose'] = credit_scoring['lemmas'].apply(to_cut_purpose)
purpose_counts = Counter(credit_scoring['ultimate_purpose'])
print(purpose_counts)
unclassified_rows = credit_scoring[credit_scoring['ultimate_purpose'] == 'unclassified']
print(unclassified_rows)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement