Advertisement
Guest User

Untitled

a guest
Dec 7th, 2019
327
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.56 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import click
  3. import logging
  4. from dotenv import find_dotenv, load_dotenv
  5. import pandas as pd
  6. import numpy as np
  7. from sklearn.compose import ColumnTransformer
  8. from sklearn.pipeline import Pipeline
  9. from sklearn.impute import SimpleImputer
  10. from sklearn.preprocessing import StandardScaler, OneHotEncoder
  11. from sklearn.ensemble import RandomForestClassifier
  12. from sklearn.model_selection import GridSearchCV
  13. from sklearn import metrics
  14. import json
  15. import pathlib
  16. from pathlib import Path
  17.  
  18.  
  19. @click.command()
  20. @click.argument('test_filepath', type=click.Path(exists=True))
  21. @click.argument('train_filepath', type=click.Path(exists=True))
  22. @click.argument('output_file_directory', type=click.Path())
  23. def main(test_filepath, train_filepath, output_file_directory):
  24.     logger = logging.getLogger(__name__)
  25.     logger.info('Reading data files')
  26.     df_train = pd.read_csv(pathlib.Path(train_filepath), delimiter=";")
  27.     df_test = pd.read_csv(pathlib.Path(test_filepath), delimiter=";")
  28.     logger.info("Training data size = {0}, testing data size = {1}".format(df_train.size, df_test.size))
  29.  
  30.     logger.info('Constructing pipeline')
  31.     numeric_features = [
  32.         'age',
  33.         'balance',
  34.         'day',
  35.         'campaign',
  36.         'pdays',
  37.         'previous',
  38.     ]
  39.     categorical_features = [
  40.         'job',
  41.         'marital',
  42.         'education',
  43.         'default',
  44.         'housing',
  45.         'loan',
  46.         'contact',
  47.         'month',
  48.         'campaign',
  49.         'pdays',
  50.         'previous',
  51.     ]
  52.     numeric_transformer_pipe = Pipeline(steps=[
  53.         ('imputer', SimpleImputer(strategy='median')),
  54.         ('scaler', StandardScaler())])
  55.  
  56.     categorical_transformer_pipe = Pipeline(steps=[
  57.         ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
  58.         ('onehot', OneHotEncoder(handle_unknown='ignore'))])
  59.  
  60.     preprocessor_pipe = ColumnTransformer(
  61.         transformers=[
  62.             ('num', numeric_transformer_pipe, numeric_features),
  63.             ('cat', categorical_transformer_pipe, categorical_features)])
  64.     X_train = df_train.drop('target', axis=1)
  65.     y_train = df_train['target']
  66.  
  67.     X_test = df_test.drop('target', axis=1)
  68.     y_test = df_test['target']
  69.     clf = Pipeline(steps=[
  70.         ('preprocessor', preprocessor_pipe),
  71.         ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100))])
  72.  
  73.     logger.info('Fitting model')
  74.     clf.fit(X_train, y_train)
  75.  
  76.     metrics_test = get_metrics(X_test, y_test, clf)
  77.     metrics_train = get_metrics(X_train, y_train, clf)
  78.     metrics_dict = {
  79.         'metrics_test': metrics_test,
  80.         'metrics_train': metrics_train
  81.     }
  82.  
  83.     pathlib.Path(output_file_directory).mkdir(parents=True, exist_ok=True)
  84.     with open(output_file_directory + '/metrics.json', 'w') as outfile:
  85.         json.dump(metrics_dict, outfile, indent=4)
  86.  
  87.     param_grid = {
  88.         'classifier__n_estimators': [10, 30, 100, 200],
  89.         'classifier__max_depth': [None, 10, 20, 30]
  90.     }
  91.  
  92.     logger.info('Calculating grid search')
  93.     grid_search = GridSearchCV(clf, param_grid, cv=5, iid=False, scoring='roc_auc', n_jobs=-1)
  94.     grid_search.fit(X_train, y_train)
  95.  
  96.     with open(output_file_directory + '/best_params.json', 'w') as outfile:
  97.         json.dump(metrics_dict, outfile, indent=4)
  98.  
  99.     converted_dict = dict()
  100.     for key in grid_search.cv_results_.keys():
  101.         val = grid_search.cv_results_[key]
  102.         converted_val = val
  103.         if isinstance(val, np.ndarray):
  104.             converted_val = val.tolist()
  105.         converted_dict[key] = converted_val
  106.  
  107.     with open(output_file_directory + '/cv_results.json', 'w') as outfile:
  108.         json.dump(converted_dict, outfile, indent=4)
  109.  
  110.     logger.info('Finished')
  111.  
  112.  
  113. def get_metrics(x, y, clf):
  114.     res = dict()
  115.     tlf = clf.predict(x)
  116.     res['model_accuracy'] = metrics.accuracy_score(y, tlf)
  117.     res['model_precision'] = metrics.precision_score(y, tlf)
  118.     res['model_recall'] = metrics.recall_score(y, tlf)
  119.     res['model_F1'] = metrics.f1_score(y, tlf)
  120.     res['model_AuROC'] = metrics.roc_auc_score(y, tlf)
  121.     return res
  122.  
  123.  
  124. if __name__ == '__main__':
  125.     log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  126.     logging.basicConfig(level=logging.INFO, format=log_fmt)
  127.  
  128.     # not used in this stub but often useful for finding various files
  129.     project_dir = Path(__file__).resolve().parents[2]
  130.  
  131.     # find .env automagically by walking up directories until it's found, then
  132.     # load up the .env entries as environment variables
  133.     load_dotenv(find_dotenv())
  134.  
  135.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement