Advertisement
Guest User

Untitled

a guest
Apr 19th, 2015
227
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.30 KB | None | 0 0
import numpy

from sklearn.preprocessing import Imputer, MinMaxScaler
  2.  
  3. # Return an oversampled version of features. The returned 2d array contains all rows
  4. # from features, plus some extra rows selected randomly. num_extra_samples is the number
  5. # of these extra rows.
  6. def oversample(features, num_extra_samples):
  7. # make sure we use all existing rows at least once
  8. indices = numpy.array(range(0, features.shape[0]))
  9. # now append num_extra_samples random indices to the list
  10. extra_indices = numpy.random.randint(0, features.shape[0], num_extra_samples)
  11. indices = numpy.append(indices, extra_indices)
  12. oversampled_features = features[indices, :]
  13. return oversampled_features
  14.  
  15. # Returns an undersampled version of features. The returned 2d array contains num_samples
  16. # random rows from the original features without replacement. The idea is we want to keep
  17. # the undersampled version as diverse as possible.
  18. def undersample(features, num_samples):
  19. indices = numpy.random.choice(features.shape[0], num_samples, replace=False)
  20. undersampled_features = features[indices, :]
  21. return undersampled_features
  22.  
  23.  
  24. # transform the features and labels such that we have equal number of rows from each
  25. # class. First the minority class is over sampled by max_oversampling rate of its original
  26. # size. The default over sampling rate is 5%. If there is still imbalance, the majority
  27. # class is undersampled until its size matches the minority class.
  28. # Prefer this function over balance_weights above.
  29. def balance_data(features, labels, max_oversampling_rate = 0.05):
  30. male = 1
  31. female = 0
  32. male_indices = (labels[:] == male)
  33. female_indices = (labels[:] == female)
  34.  
  35. male_features = features[male_indices]
  36. female_features = features[female_indices]
  37.  
  38. # should oversample females or males? but do not over sample by more than
  39. # max_oversampling_rate of original sample, so we avoid too many repetitions of the same
  40. # data.
  41. if female_features.shape[0] < male_features.shape[0]:
  42. num_extra_samples = min(male_features.shape[0] - female_features.shape[0],
  43. int(female_features.shape[0] * max_oversampling_rate))
  44. female_features = oversample(female_features, num_extra_samples)
  45.  
  46. elif male_features.shape[0] < female_features.shape[0]:
  47. num_extra_samples = min(female_features.shape[0] - male_features.shape[0],
  48. int(male_features.shape[0] * max_oversampling_rate))
  49. male_features = oversample(male_features, num_extra_samples)
  50.  
  51. # Now that we over sampled class, we might need to undersample the majority class in
  52. # case there is still imbalance.
  53. if female_features.shape[0] > male_features.shape[0]:
  54. num_samples = male_features.shape[0]
  55. female_features = undersample(female_features, num_samples)
  56.  
  57. elif male_features.shape[0] > female_features.shape[0]:
  58. num_samples = female_features.shape[0]
  59. male_features = undersample(male_features, num_samples)
  60.  
  61. # generate labels now that we have balanced two classes
  62. female_labels = numpy.full(female_features.shape[0], female, dtype=numpy.int)
  63. male_labels = numpy.full(male_features.shape[0], male, dtype=numpy.int)
  64.  
  65. # concatenate both features and labels
  66. balanced_features = numpy.concatenate((female_features, male_features))
  67. balanced_labels = numpy.concatenate((female_labels, male_labels))
  68. return (balanced_features, balanced_labels)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement