Advertisement
Guest User

Untitled

a guest
Jun 24th, 2019
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.88 KB | None | 0 0
  1. ...
  ## TRAIN

  # Keep the label aside before it is removed from the feature frame.
  target = df_HR['Fluktuation'].copy()

  # remove the target feature and redundant features from the dataset
  # (in place: df_HR itself is the feature matrix from here on)
  df_HR.drop(
      ['Fluktuation', 'FTE', 'Mitarbeiternummer',
       'StandardStunden', 'Volljaehrig'],
      axis=1,
      inplace=True,
  )
  print('Size of Full dataset is: {}'.format(df_HR.shape))

  # 75/25 hold-out split, stratified on the label so both parts keep the
  # same class proportions.
  X_train, X_test, y_train, y_test = train_test_split(
      df_HR,
      target,
      test_size=0.25,
      random_state=7,
      stratify=target,
  )

  ## CREATE MODEL AND STORE IT

  # random_state only takes effect together with shuffle=True; without it the
  # seed is ignored by older scikit-learn releases and rejected with a
  # ValueError by newer ones.
  kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
  modelCV = LogisticRegression(
      solver='liblinear',
      class_weight="balanced",
      random_state=7,
  )
  scoring = 'roc_auc'

  # Baseline: cross-validated AUC of the untuned model on the training part.
  results = model_selection.cross_val_score(
      modelCV, X_train, y_train, cv=kfold, scoring=scoring)
  print(" Logistic Regression algorithm AUC score (STD): %.2f (%.2f)" % (results.mean(), results.std()))

  # hyper-parameter list to fine-tune
  param_grid = {'C': np.arange(1e-03, 2, 0.01)}

  # Grid search over C. The former `iid=True` argument is intentionally not
  # passed: it was deprecated and later removed from GridSearchCV, where
  # passing it raises a TypeError.
  log_gs = GridSearchCV(
      LogisticRegression(
          solver='liblinear',
          class_weight="balanced",
          random_state=7,
      ),
      return_train_score=True,
      param_grid=param_grid,
      scoring='roc_auc',
      cv=10,
  )

  log_grid = log_gs.fit(X_train, y_train)
  log_opt = log_grid.best_estimator_
  # NOTE: `results` is rebound here from the baseline score array to the
  # grid-search result table.
  results = log_gs.cv_results_

  # Persist the complete fitted search object (not only log_opt).
  model_file_name = '%s/model.pkl' % modelFolder
  joblib.dump(log_gs, model_file_name)
  44.  
  45. ## LOAD MODEL AND PREDICT NEW XLSX FILE
  46. ...
  # Work on a copy so the freshly loaded source frame stays untouched.
  df_HRE = df_sourcefileE.copy()
  dfColumnsE = df_HRE.columns

  # NOTE(review): the encoder and scaler below are fitted on the NEW data, not
  # on the data the model was trained with, so codes and scaled values are only
  # comparable if both files cover the same categories and value ranges.
  # Consider persisting the fitted transformers next to the model - confirm
  # against the training-time preprocessing.
  leE = LabelEncoder()
  le_countE = 0

  # Label-encode text columns with at most two distinct values; the first
  # column is skipped.
  for col in df_HRE.columns[1:]:
      if df_HRE[col].dtype == 'object' and len(df_HRE[col].unique()) <= 2:
          df_HRE[col] = leE.fit_transform(df_HRE[col])
          le_countE += 1
  print('{} columns label encoded.'.format(le_countE))

  # One-hot encode whatever text columns remain.
  df_HRE = pd.get_dummies(df_HRE, drop_first=True)

  # import MinMaxScaler
  from sklearn.preprocessing import MinMaxScaler
  scaler = MinMaxScaler(feature_range=(0, 5))

  # A file that is only being scored may not contain the label column, so it is
  # treated as optional instead of failing on its absence.
  has_targetE = 'Fluktuation' in df_HRE.columns
  HRE_col = [col for col in df_HRE.columns if col != 'Fluktuation']

  # MinMaxScaler scales each feature independently, so one call over all
  # feature columns yields the same values as refitting it column by column.
  if HRE_col:
      df_HRE[HRE_col] = scaler.fit_transform(df_HRE[HRE_col].astype(float))

  if has_targetE:
      df_HRE['Fluktuation'] = pd.to_numeric(df_HRE['Fluktuation'], downcast='float')
      targetE = df_HRE['Fluktuation'].copy()
  else:
      targetE = None

  # Remove the label and the redundant columns; columns missing from the new
  # file are skipped rather than raising.
  df_HRE.drop(
      ['Fluktuation', 'FTE', 'Mitarbeiternummer',
       'StandardStunden', 'Volljaehrig'],
      axis=1,
      inplace=True,
      errors='ignore',
  )

  # get_dummies on new data can produce fewer, more, or differently ordered
  # columns than the model was fitted on. When the loaded model records its
  # training feature names, align to them; dummy columns absent from this file
  # are filled with 0.
  expected_colsE = getattr(loaded_model, 'feature_names_in_', None)
  if expected_colsE is not None:
      df_HRE = df_HRE.reindex(columns=list(expected_colsE), fill_value=0)

  # apply the whole pipeline to data
  pred = loaded_model.predict(df_HRE)
  print(pred)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement