Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy
from sklearn.preprocessing import Imputer, MinMaxScaler
# Return an oversampled version of features. The returned 2d array contains all
# rows from features, plus some extra rows selected randomly (with replacement).
# num_extra_samples is the number of these extra rows.
def oversample(features, num_extra_samples):
    """Append num_extra_samples randomly repeated rows to features.

    features: 2d numpy array. The result has the same columns and
    features.shape[0] + num_extra_samples rows: every original row appears
    at least once, in original order, followed by the random extras.
    """
    # Make sure we use all existing rows at least once.
    # (numpy.arange replaces the slower numpy.array(range(...)) spelling.)
    indices = numpy.arange(features.shape[0])
    # Now append num_extra_samples random indices, drawn with replacement.
    extra_indices = numpy.random.randint(0, features.shape[0], num_extra_samples)
    indices = numpy.append(indices, extra_indices)
    oversampled_features = features[indices, :]
    return oversampled_features
# Returns an undersampled version of features. The returned 2d array contains
# num_samples random rows from the original features without replacement. The
# idea is we want to keep the undersampled version as diverse as possible.
def undersample(features, num_samples):
    """Pick num_samples distinct random rows from the 2d array features."""
    # Sampling without replacement guarantees no row is repeated.
    chosen_rows = numpy.random.choice(features.shape[0], num_samples, replace=False)
    return features[chosen_rows, :]
# Transform the features and labels such that we have an equal number of rows
# from each class. First the minority class is oversampled by at most
# max_oversampling_rate of its original size (default 5%). If there is still
# imbalance, the majority class is undersampled until its size matches the
# (oversampled) minority class.
# Prefer this function over balance_weights above.
def balance_data(features, labels, max_oversampling_rate=0.05):
    """Return (balanced_features, balanced_labels) with equal rows per class.

    features: 2d numpy array, one sample per row.
    labels: 1d numpy array aligned with features; assumed binary with values
        0 (female) and 1 (male) -- rows with any other label are dropped.
    max_oversampling_rate: cap on minority-class duplication, as a fraction
        of the minority class's original size.
    """
    male = 1
    female = 0
    # Boolean masks select each class's rows (labels[:] copy was redundant).
    male_features = features[labels == male]
    female_features = features[labels == female]
    num_males = male_features.shape[0]
    num_females = female_features.shape[0]
    # Oversample the minority class, but not by more than max_oversampling_rate
    # of its original size, so we avoid too many repetitions of the same data.
    if num_females < num_males:
        num_extra_samples = min(num_males - num_females,
                                int(num_females * max_oversampling_rate))
        female_features = oversample(female_features, num_extra_samples)
    elif num_males < num_females:
        num_extra_samples = min(num_females - num_males,
                                int(num_males * max_oversampling_rate))
        male_features = oversample(male_features, num_extra_samples)
    # The oversampling is capped, so an imbalance may remain; undersample the
    # majority class down to the minority class's (oversampled) size.
    if female_features.shape[0] > male_features.shape[0]:
        female_features = undersample(female_features, male_features.shape[0])
    elif male_features.shape[0] > female_features.shape[0]:
        male_features = undersample(male_features, female_features.shape[0])
    # Generate labels now that the two classes are balanced.
    # numpy.int (a deprecated alias of the builtin int) was removed in
    # NumPy 1.24; dtype=int produces the identical default integer dtype.
    female_labels = numpy.full(female_features.shape[0], female, dtype=int)
    male_labels = numpy.full(male_features.shape[0], male, dtype=int)
    # Concatenate both features and labels, females first.
    balanced_features = numpy.concatenate((female_features, male_features))
    balanced_labels = numpy.concatenate((female_labels, male_labels))
    return (balanced_features, balanced_labels)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement