Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Demo script: generate a heavily imbalanced synthetic binary dataset,
# split it into train/test sets, and oversample the minority class of the
# training set with SMOTE.

# make_classification returns a (features, labels) tuple; the class weights
# request roughly 99.99% of samples in class 0 and 0.01% in class 1.
df = make_classification(
    n_samples=100000,
    n_features=10,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.9999, 0.0001],
    flip_y=0,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=71,
)

# Wrap the feature matrix in a DataFrame with columns var1..var10 and attach
# the labels as a 'Class' column.
feature_columns = [f"var{i}" for i in range(1, 11)]
df_raw = DataFrame(df[0], columns=feature_columns)
df_raw['Class'] = df[1]

# Show the class distribution before any resampling.
print(df_raw['Class'].value_counts())

# The first ten columns are the features; 'Class' is the target.
X = df_raw.iloc[:, 0:10]
y = df_raw['Class']

# NOTE(review): the split is not stratified. With this few minority samples
# the training set may end up with too few of them for SMOTE's neighbour
# search; consider passing stratify=y -- confirm this is acceptable first,
# since it changes which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=71
)

# Oversample only the training data so the test set stays untouched.
# sampling_strategy=0.1 asks for a minority/majority ratio of 0.1 after
# resampling.
smote = SMOTE(sampling_strategy=0.1, random_state=72)

# fit_resample replaces fit_sample, which imbalanced-learn deprecated in 0.4
# and removed in 0.8.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the shapes of the resampled features and labels.
print(X_train_resampled.shape)
print(y_train_resampled.shape)
Add Comment
Please, Sign In to add comment