Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ...
## TRAIN
# Separate the label column from the feature matrix.
target = df_HR['Fluktuation'].copy()

# Remove the target feature and redundant features from the dataset.
# NOTE: this mutates df_HR in place; code after this section sees the
# reduced frame.
df_HR.drop(['Fluktuation', 'FTE', 'Mitarbeiternummer',
            'StandardStunden', 'Volljaehrig'], axis=1, inplace=True)
print('Size of Full dataset is: {}'.format(df_HR.shape))

# Stratified 75/25 split so both partitions keep the class ratio of `target`.
X_train, X_test, y_train, y_test = train_test_split(df_HR,
                                                    target,
                                                    test_size=0.25,
                                                    random_state=7,
                                                    stratify=target)

## CREATE MODEL AND STORE IT
# `random_state` only has an effect when shuffling is enabled; without
# `shuffle=True` the seed is ignored by older scikit-learn releases and
# rejected with a ValueError by newer ones.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = LogisticRegression(solver='liblinear',
                             class_weight="balanced",
                             random_state=7)
scoring = 'roc_auc'

# Baseline: cross-validated AUC of the untuned model on the training split.
cv_scores = model_selection.cross_val_score(
    modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print(" Logistic Regression algorithm AUC score (STD): %.2f (%.2f)" % (cv_scores.mean(), cv_scores.std()))

# Hyper-parameter grid to fine-tune: C from 0.001 up to (but excluding) 2
# in steps of 0.01.
param_grid = {'C': np.arange(1e-03, 2, 0.01)}

# The `iid` argument was dropped here: it was deprecated and later removed
# from GridSearchCV, so passing it raises a TypeError on current releases.
log_gs = GridSearchCV(LogisticRegression(solver='liblinear',
                                         class_weight="balanced",
                                         random_state=7),
                      return_train_score=True,
                      param_grid=param_grid,
                      scoring=scoring,
                      cv=10)
log_grid = log_gs.fit(X_train, y_train)
log_opt = log_grid.best_estimator_
results = log_gs.cv_results_

# Persist the whole fitted grid search (not only the best estimator).
model_file_name = '%s/model.pkl' % modelFolder
joblib.dump(log_gs, model_file_name)
## LOAD MODEL AND PREDICT NEW XLSX FILE
...  # elided in the paste; presumably defines `loaded_model` and `df_sourcefileE` -- TODO confirm
df_HRE = df_sourcefileE.copy()
dfColumnsE = df_HRE.columns

# Label-encode every object column (the first column is skipped) that has
# at most two distinct values.
# NOTE(review): the encoder is re-fitted on the new file, so the
# value -> code mapping only matches training when both files contain the
# same categories. Reusing the encoders fitted at training time would be
# safer; they are not visible from this section.
leE = LabelEncoder()
le_countE = 0
for col in df_HRE.columns[1:]:
    if df_HRE[col].dtype == 'object' and len(df_HRE[col].unique()) <= 2:
        df_HRE[col] = leE.fit_transform(df_HRE[col])
        le_countE += 1
print('{} columns label encoded.'.format(le_countE))

# One-hot encode the remaining categorical columns.
# NOTE(review): the resulting column set depends on the categories present
# in this file and may differ from the columns the model was fitted on.
df_HRE = pd.get_dummies(df_HRE, drop_first=True)

# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))

# New data does not necessarily carry the label column, so treat it as
# optional instead of failing on a missing 'Fluktuation'.
has_target = 'Fluktuation' in df_HRE.columns
HRE_col = [col for col in df_HRE.columns if col != 'Fluktuation']

# Scale every feature column to the range [0, 5], one column at a time.
# NOTE(review): the scaler is re-fitted on the inference data, so the
# min/max used here are those of the new file, not of the training data.
for col in HRE_col:
    df_HRE[col] = df_HRE[col].astype(float)
    df_HRE[[col]] = scaler.fit_transform(df_HRE[[col]])

if has_target:
    df_HRE['Fluktuation'] = pd.to_numeric(df_HRE['Fluktuation'], downcast='float')
    targetE = df_HRE['Fluktuation'].copy()
else:
    targetE = None

# Drop the label and the redundant features removed before training;
# columns that are absent from the new file are skipped.
df_HRE.drop(['Fluktuation', 'FTE', 'Mitarbeiternummer',
             'StandardStunden', 'Volljaehrig'],
            axis=1, inplace=True, errors='ignore')

# Apply the stored model to the prepared features.
pred = loaded_model.predict(df_HRE)
print(pred)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement