jack06215

[imblearn] SMOTE dealing with imbalanced datasets

May 24th, 2020
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.10 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. from pandas import DataFrame, Series
  4.  
  5. from sklearn.datasets import make_classification
  6. from sklearn.model_selection import train_test_split
  7.  
  8. from imblearn.over_sampling import SMOTE
  9. df = make_classification(n_samples = 100000, n_features = 10, n_informative = 2, n_redundant = 0,
  10.                          n_repeated = 0, n_classes = 2, n_clusters_per_class = 2, weights = [0.9999, 0.0001],
  11.                          flip_y = 0, class_sep = 1.0, hypercube = True, shift = 0.0,
  12.                          scale = 1.0, shuffle = True, random_state = 71)
  13.  
  14. df_raw = DataFrame(df[0], columns = ['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var9', 'var10'])
  15. df_raw['Class'] = df[1]
  16.  
  17. print(df_raw['Class'].value_counts())
  18.  
  19. X = df_raw.iloc[:, 0:10]
  20. y = df_raw['Class']
  21. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 71)
  22.  
  23. smote = SMOTE(sampling_strategy=0.1, random_state=72)
  24. X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train)
  25.  
  26. print(X_train_resampled.shape)
  27. print(y_train_resampled.shape)
Add Comment
Please, Sign In to add comment