Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python2
- # -*- coding: utf-8 -*-
- """
- Created on Tue Jan 16 21:32:56 2018
- @author: hericsonaraujo
- """
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.metrics import accuracy_score, make_scorer
- from sklearn.model_selection import train_test_split
- # Import basic libraries
- import numpy as np # linear algebra
- import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
- # import visualization libraries
- import seaborn as sns
- import matplotlib.pyplot as plt
- #from ggplot import *
# Input file and column names used by the analysis.
CSV_PATH = 'UCI_Credit_Card.csv'
TARGET = 'def_pay'
PAY_STATUS_COLUMNS = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
FEATURES = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
    'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

# Tree depths that are trained and scored, in the order they are reported.
TREE_DEPTHS = (10, 100)


def clean_credit_data(raw):
    """Return the credit-card frame with renamed and recoded columns.

    The steps applied, in order:

    * rename ``default.payment.next.month`` to ``def_pay`` and ``PAY_0``
      to ``PAY_1``;
    * recode EDUCATION values 0, 5 and 6 to 4;
    * recode MARRIAGE value 0 to 3;
    * recode the values -2 and -1 to 0 in every ``PAY_1`` .. ``PAY_6``
      column.

    ``raw`` must contain EDUCATION, MARRIAGE, PAY_0 and PAY_2 .. PAY_6.
    The recoding is applied to the frame returned by ``rename``.
    """
    df = raw.rename(columns={'default.payment.next.month': 'def_pay',
                             'PAY_0': 'PAY_1'})

    # NOTE(review): 0/5/6 look like undocumented category codes being folded
    # into category 4 -- confirm against the dataset description.
    df.loc[df.EDUCATION.isin([0, 5, 6]), 'EDUCATION'] = 4
    df.loc[df.MARRIAGE == 0, 'MARRIAGE'] = 3

    # One loop instead of six copy-pasted stanzas.  Rows already equal to 0
    # need no assignment, so only -2 and -1 are selected.
    for column in PAY_STATUS_COLUMNS:
        df.loc[df[column].isin([-2, -1]), column] = 0

    return df


def tree_accuracy(X_train, X_test, y_train, y_test, max_depth):
    """Fit a decision tree of depth ``max_depth``; return its test accuracy.

    The classifier is seeded with ``random_state=14`` so repeated runs give
    the same tree.
    """
    classifier = DecisionTreeClassifier(max_depth=max_depth, random_state=14)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_true=y_test, y_pred=predictions)


def main():
    """Load the CSV, clean it, and print summaries and tree accuracies."""
    df = clean_credit_data(pd.read_csv(CSV_PATH))
    df.info()

    # Fraction of rows whose target is 1; the float() casts keep true
    # division under the python2 interpreter named in the shebang.
    print(float(df.def_pay.sum()) / float(len(df.def_pay)))

    y = df[TARGET].copy()
    X = df[FEATURES].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    # Compare the target distribution of the full data and of each split.
    print(df.def_pay.describe())
    print("---------------------------")
    print(y_train.describe())
    print("---------------------------")
    print(y_test.describe())

    for depth in TREE_DEPTHS:
        print(tree_accuracy(X_train, X_test, y_train, y_test, depth))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement