Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_feat_dist(feature):
    """Return per-category frequencies of ``feature`` in two sampled groups.

    Relies on module-level names defined outside this block: ``train_sur``,
    ``train_not``, ``train_set``, ``max_sample_size`` and
    ``train_test_split`` (presumably scikit-learn's -- confirm against the
    file's imports).

    Args:
        feature: Column key used to index the sampled frames and
            ``train_set``.

    Returns:
        A dict mapping every distinct value of ``train_set[feature]`` (in
        sorted order) to a two-element list ``[sur_fraction, not_fraction]``,
        where each entry is the number of matching rows in the respective
        sample divided by ``max_sample_size``.
    """
    # Only train_not is sliced to max_sample_size here; train_sur is passed
    # through as-is.
    # NOTE(review): presumably train_sur is already capped at max_sample_size
    # elsewhere -- confirm with the caller.
    partitions = train_test_split(
        train_sur,
        train_not[:max_sample_size],
        train_size=0.5,
        random_state=42,
        shuffle=True,
    )
    # The first partition of each input array is kept; the complementary
    # partitions (indices 1 and 3) are ignored.
    sample_sur = partitions[0]
    sample_not = partitions[2]

    print(len(sample_sur.values))
    print(len(sample_not.values))

    sur_column = sample_sur[feature]
    not_column = sample_not[feature]

    def _fraction(column, category):
        # Count the rows equal to `category`, scaled by max_sample_size.
        # NOTE(review): the denominator is max_sample_size, not the sample
        # length printed above -- confirm that this is intended.
        return len(column[column == category].values) / max_sample_size

    return {
        category: [
            _fraction(sur_column, category),
            _fraction(not_column, category),
        ]
        for category in sorted(train_set[feature].unique())
    }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement