'''
Overview
The data has been split into two groups:
  1) training set (train.csv) - 891 rows
  2) test set  (test.csv)     - 418 rows
Contents:
  1) Import Necessary Libraries
  2) Read In and Explore the Data
  3) Data Analysis
  4) Data Visualization
  5) Cleaning Data
  6) Choosing the Best Model
  7) Creating Submission File
'''
#1) Import Necessary Libraries
#*********************************************
#First off, we need to import several Python libraries such as numpy,
# pandas, matplotlib and seaborn.
#data analysis libraries
import numpy as np
import pandas as pd
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 16)
pd.set_option('display.precision', 2)
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sbn
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
#STEP-2) Read in and Explore the Data
#*********************************************
#It's time to read in our training and testing data using pd.read_csv,
# and take a first look at the training data using the describe() function.
#import train and test CSV files
train = pd.read_csv('train.csv')   #12 columns
test  = pd.read_csv('test.csv')    #11 columns
#take a look at the training data
print( "A look at the training data : \n", train.describe() )
print( "\n" )
print( train.describe(include="all") )
print( "\n" )
#STEP-3) Data Analysis
#**************************************************
#We're going to consider the features in the
# dataset and how complete they are.
#get a list of the features within the dataset
print( "\n\n", train.columns )
#OUTPUT
#Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
#      dtype='object')
#see a sample of the dataset to get an idea of the variables
print()
print( train.head() )
print()
print( train.sample(5) )
#Observations from above output
#-----------------------------
#Numerical Features: Age (Continuous), Fare (Continuous), SibSp (Discrete), Parch (Discrete)
#Categorical Features: Survived, Sex, Embarked, Pclass
#Alphanumeric Features: Name, Ticket, Cabin
print( "Data types for each feature : -" )
print( train.dtypes )
#Now that we have an idea of what kinds of features we're working with,
# we can see how much information we have about each of them.
#see a summary of the training dataset
print( train.describe(include = "all") )
#Some Observations from above output
#------------------------------------
#1)There are a total of 891 passengers in our training set.
#2)The Age feature is missing approximately 19.8% of its values.
#  Age is likely to be important to survival,
#  so we should probably attempt to fill these gaps.
#3)The Cabin feature is missing approximately 77.1% of its values.
#  Since so much of the feature is missing, it would be hard to fill in
#  the missing values. We'll probably drop these values from our dataset.
#4)The Embarked feature is missing values for only 2 passengers,
#  which should be relatively harmless.
#check for any other unusable values
print()
print( pd.isnull(train).sum() )
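#(sketch) the missing-value percentages quoted in the observations above
# can be derived directly from these null counts with a one-liner:
print( (train.isnull().sum() / len(train) * 100).round(1) )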
#We can see that except for the above mentioned missing values,
# no NaN values exist.
#Relationship between Features and Survival
#In this section, we analyze the relationship between different features
# with respect to Survival. We see how different feature values
# show different survival chances. We also plot different kinds of
# diagrams to visualize our data and findings.
#4) Data Visualization
#*************************************
#It's time to visualize our data so we can estimate a few predictions
#-----------------
#4.A) Sex Feature
#-----------------
#draw a bar plot of survival by sex
sbn.barplot(x="Sex", y="Survived", data=train)
plt.show()
print( "Percentages of females vs. males that survived" )
print( "Percentage of females who survived:", train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)[1]*100 )
#step-by-step look at how the expression above is built up
print( "------------------\n\n" )
print( train )
print( "------------------\n\n" )
print( train["Survived"] )
print( "------------------\n\n" )
print( train["Sex"] == 'female' )
print( "**********\n\n" )
print( train["Survived"][ train["Sex"] == 'female' ] )
print( "*****************\n\n" )
print( train["Survived"][train["Sex"] == 'female'].value_counts() )
print( "====================================\n\n" )
print( train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True) )
print( train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)[1] )
print( "Percentage of females who survived:", train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)[1]*100 )
print( "Percentage of males who survived:", train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)[1]*100 )
#OUTPUT:-
#Percentage of females who survived: 74.2038216561
#Percentage of males who survived: 18.8908145581
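#(sketch) the same percentages in a single idiomatic line; since Survived
# is 0/1, the mean per Sex is exactly the survival rate
print( train.groupby('Sex')['Survived'].mean() * 100 )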
#Some Observations from above output
#------------------------------------
#As predicted, females have a much higher chance of survival than males.
# The Sex feature is essential in our predictions.
#--------------------
#4.B) Pclass Feature
#--------------------
#draw a bar plot of survival by Pclass
sbn.barplot(x="Pclass", y="Survived", data=train)
plt.show()
#print percentages of people by Pclass that survived
print("Percentage of Pclass = 1 who survived:", train["Survived"][train["Pclass"] == 1].value_counts(normalize = True)[1]*100)
print("Percentage of Pclass = 2 who survived:", train["Survived"][train["Pclass"] == 2].value_counts(normalize = True)[1]*100)
print("Percentage of Pclass = 3 who survived:", train["Survived"][train["Pclass"] == 3].value_counts(normalize = True)[1]*100)
#OUTPUT:-
#Percentage of Pclass = 1 who survived: 62.962962963
#Percentage of Pclass = 2 who survived: 47.2826086957
#Percentage of Pclass = 3 who survived: 24.2362525458
print()
print( "Survived counts for Pclass = 1:\n\n", train["Survived"][train["Pclass"] == 1].value_counts() )
print()
print( "Survived proportions for Pclass = 1:\n\n", train["Survived"][train["Pclass"] == 1].value_counts(normalize = True) )
print()
print( "Survival rate for Pclass = 1:\n\n", train["Survived"][train["Pclass"] == 1].value_counts(normalize = True)[1] )
#Some Observations from above output
#------------------------------------
#As predicted, people with a higher socioeconomic class had a higher
# rate of survival. (62.9% vs. 47.3% vs. 24.2%)
#----------------------
#4.C) SibSp Feature
#----------------------
#draw a bar plot for SibSp vs. survival
sbn.barplot(x="SibSp", y="Survived", data=train)
#I won't be printing individual percent values for all of these.
print("Percentage of SibSp = 0 who survived:",
      train["Survived"][train["SibSp"] == 0].value_counts(normalize = True)[1]*100)
print("Percentage of SibSp = 1 who survived:",
      train["Survived"][train["SibSp"] == 1].value_counts(normalize = True)[1]*100)
print("Percentage of SibSp = 2 who survived:",
      train["Survived"][train["SibSp"] == 2].value_counts(normalize = True)[1]*100)
#OUTPUT:-
#Percentage of SibSp = 0 who survived: 34.5394736842
#Percentage of SibSp = 1 who survived: 53.5885167464
#Percentage of SibSp = 2 who survived: 46.4285714286
plt.show()
#Some Observations from above output
#------------------------------------
#In general, it's clear that people with more siblings or
# spouses aboard were less likely to survive.
# However, contrary to expectations, people with no siblings
# or spouses were less likely to survive than those with one or two.
# (34.5% vs. 53.6% vs. 46.4%)
#--------------------
#4.D) Parch Feature
#--------------------
#draw a bar plot for Parch vs. survival
sbn.barplot(x="Parch", y="Survived", data=train)
plt.show()
#Some Observations from above output
#------------------------------------
#People with fewer than four parents or children aboard are more likely
# to survive than those with four or more.
# Again, people traveling alone are less likely to survive than those
# with 1-3 parents or children.
#-----------------
#4.E) Age Feature
#-----------------
#sort the ages into logical categories
train["Age"] = train["Age"].fillna(-0.5)
test["Age"]  = test["Age"].fillna(-0.5)
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
train['AgeGroup'] = pd.cut(train["Age"], bins, labels = labels)
test['AgeGroup']  = pd.cut(test["Age"], bins, labels = labels)
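#(optional sketch) a quick check of how many passengers landed in each bin
print( train['AgeGroup'].value_counts(sort=False) )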
print( train )
#draw a bar plot of Age vs. survival
sbn.barplot(x="AgeGroup", y="Survived", data=train)
plt.show()
#Some Observations from above output
#------------------------------------
#Babies are more likely to survive than any other age group.
#--------------------
#4.F) Cabin Feature
#--------------------
#I think the idea here is that people with recorded cabin numbers are of
# higher socioeconomic class, and thus more likely to survive.
train["CabinBool"] = (train["Cabin"].notnull().astype('int'))
test["CabinBool"]  = (test["Cabin"].notnull().astype('int'))
print( "###################################\n\n" )
print( train )
#calculate percentages of CabinBool vs. survived
print("Percentage of CabinBool = 1 who survived:",
      train["Survived"][train["CabinBool"] == 1].value_counts(
          normalize = True)[1]*100)
print("Percentage of CabinBool = 0 who survived:",
      train["Survived"][train["CabinBool"] == 0].value_counts(
          normalize = True)[1]*100)
#draw a bar plot of CabinBool vs. survival
sbn.barplot(x="CabinBool", y="Survived", data=train)
plt.show()
#OUTPUT:-
#Percentage of CabinBool = 1 who survived: 66.6666666667
#Percentage of CabinBool = 0 who survived: 29.9854439592
#Some Observations from above output
#------------------------------------
#People with a recorded Cabin number are, in fact,
# more likely to survive. (66.7% vs. 30.0%)
#5) Cleaning Data
#*********************************
#Time to clean our data to account for missing values and unnecessary information!
#Looking at the Test Data
#Let's see how our test data looks!
print( test.describe(include="all") )
#Some Observations from above output for test.csv data
#----------------------------------------------------
#1) We have a total of 418 passengers.
#2) 1 value from the Fare feature is missing.
#3) Around 20.5% of the Age feature is missing in the test file;
#   we will need to fill that in.
#Cabin Feature
#We'll start off by dropping the Cabin feature since not a lot more
# useful information can be extracted from it.
train = train.drop(['Cabin'], axis = 1)
test  = test.drop(['Cabin'], axis = 1)
#Ticket Feature
#We can also drop the Ticket feature since it's unlikely to yield any
# useful information.
train = train.drop(['Ticket'], axis = 1)
test  = test.drop(['Ticket'], axis = 1)
#Embarked Feature
#Now we need to fill in the missing values in the Embarked feature.
print( "Number of people embarking in Southampton (S):", train[train["Embarked"] == "S"] )
print( "\n\nSHAPE = ", train[train["Embarked"] == "S"].shape )
print( "SHAPE[0] = ", train[train["Embarked"] == "S"].shape[0] )
southampton = train[train["Embarked"] == "S"].shape[0]
print( southampton )
print( "Number of people embarking in Cherbourg (C):" )
cherbourg = train[train["Embarked"] == "C"].shape[0]
print( cherbourg )
print( "Number of people embarking in Queenstown (Q):" )
queenstown = train[train["Embarked"] == "Q"].shape[0]
print( queenstown )
#It's clear that the majority of people embarked in Southampton (S).
# Let's go ahead and fill in the missing values with S.
#replacing the missing values in the Embarked feature with S
train = train.fillna({"Embarked": "S"})
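#(sketch) an equivalent, non-hard-coded alternative: take the most common
# port from the data itself; on this data mode() is also 'S'
#train = train.fillna({"Embarked": train["Embarked"].mode()[0]})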
#Age Feature
#Next we'll fill in the missing values in the Age feature.
# Since a higher percentage of values are missing,
# it would be illogical to fill all of them with the same value
# (as we did with Embarked).
# Instead, let's try to find a way to predict the missing ages.
#create a combined group of both datasets
combine = [train, test]
print( "combined data : \n", combine[0] )
#extract a title for each Name in the train and test datasets
for dataset in combine:
    dataset['Title'] = dataset['Name'].str.extract(r', ([A-Za-z]+)\.', expand=False)
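#(sketch) how that regex behaves on a single name string, using the
# standard re module; 'Braund, Mr. Owen Harris' is the first row of train.csv
import re
print( re.search(r', ([A-Za-z]+)\.', 'Braund, Mr. Owen Harris').group(1) )   #-> Mr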
- print( "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n" )
- print( train )
- print()
- # crosstab function builds a cross-tabulation table that can show the frequency with which certain groups of data appear.
- print( pd.crosstab(train['Title'], train['Sex'] ) )
- # replace various titles with more common names
- for dataset in combine:
- dataset['Title'] = dataset['Title'].replace(
- ['Lady', 'Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'],
- 'Rare')
- dataset['Title'] = dataset['Title'].replace(['Countess', 'Sir'], 'Royal')
- dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
- dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
- dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
- print( "\n\nAfter grouping rare title : \n" , train )
- print( train[['Title', 'Survived']].groupby(['Title'],
- as_index=True).count() )
- print( "\nMap each of the title groups to a numerical value." )
- title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Royal": 5, "Rare": 6}
- for dataset in combine:
- dataset['Title'] = dataset['Title'].map(title_mapping)
- dataset['Title'] = dataset['Title'].fillna(0)
- print( "\n\nAfter replacing title with neumeric values.\n" )
- print( train )
- #NOTICE the values of last newly added column 'Title'
#Next, we'll try to predict the missing Age values from the most common
# age for their Title.
#fill missing age with the mode age group for each title
mr_age = train[train["Title"] == 1]["AgeGroup"].mode()     #Mr. = Young Adult
#(i.e. the mode of the AgeGroup for rows whose Title is 1)
print( "mode() of mr_age : ", mr_age )
print( "\n\n" )
miss_age = train[train["Title"] == 2]["AgeGroup"].mode()   #Miss. = Student
print( "mode() of miss_age : ", miss_age )
print( "\n\n" )
mrs_age = train[train["Title"] == 3]["AgeGroup"].mode()    #Mrs. = Adult
print( "mode() of mrs_age : ", mrs_age )
print( "\n\n" )
master_age = train[train["Title"] == 4]["AgeGroup"].mode() #Master. = Baby
print( "mode() of master_age : ", master_age )
print( "\n\n" )
royal_age = train[train["Title"] == 5]["AgeGroup"].mode()  #Royal = Adult
print( "mode() of royal_age : ", royal_age )
print( "\n\n" )
rare_age = train[train["Title"] == 6]["AgeGroup"].mode()   #Rare = Adult
print( "mode() of rare_age : ", rare_age )
print( "\n\n**************************************************\n\n" )
print( train.describe(include="all") )
print( train )
print( "\n\n******** train['AgeGroup'] for the first 10 rows : \n\n" )
for x in range(10):
    print( train["AgeGroup"][x] )
age_title_mapping = {1: "Young Adult", 2: "Student",
                     3: "Adult", 4: "Baby", 5: "Adult", 6: "Adult"}
#replace each 'Unknown' AgeGroup with the mode age group for that
# passenger's title (e.g. x = 5 means the 6th record)
for x in range(len(train["AgeGroup"])):
    if train["AgeGroup"][x] == "Unknown":
        train.loc[x, "AgeGroup"] = age_title_mapping[ train["Title"][x] ]
for x in range(len(test["AgeGroup"])):
    if test["AgeGroup"][x] == "Unknown":
        test.loc[x, "AgeGroup"] = age_title_mapping[ test["Title"][x] ]
print( "\n\nAfter replacing Unknown values in the AgeGroup column : \n" )
print( train )
#Now that we've filled in the missing values at least somewhat accurately,
# it is time to map each age group to a numerical value.
#map each AgeGroup value to a numerical value
age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3,
               'Student': 4, 'Young Adult': 5,
               'Adult': 6, 'Senior': 7}
train['AgeGroup'] = train['AgeGroup'].map(age_mapping)
test['AgeGroup']  = test['AgeGroup'].map(age_mapping)
print()
print( train )
#dropping the Age feature for now, might change
train = train.drop(['Age'], axis=1)
test  = test.drop(['Age'], axis=1)
print( "\n\nAge column dropped." )
print( train )
#Name Feature
#We can drop the Name feature now that we've extracted the titles,
# since it contains no more useful information.
train = train.drop(['Name'], axis = 1)
test  = test.drop(['Name'], axis = 1)
#Sex Feature
#map each Sex value to a numerical value
sex_mapping = {"male": 0, "female": 1}
train['Sex'] = train['Sex'].map(sex_mapping)
test['Sex']  = test['Sex'].map(sex_mapping)
print( train )
#Embarked Feature
#map each Embarked value to a numerical value
embarked_mapping = {"S": 1, "C": 2, "Q": 3}
train['Embarked'] = train['Embarked'].map(embarked_mapping)
test['Embarked']  = test['Embarked'].map(embarked_mapping)
print()
print( train.head() )
#Fare Feature
#It is time to separate the fare values into some logical groups, as well
# as fill in the single missing value in the test dataset.
#fill in the missing Fare value in the test set based on the mean fare
# for that Pclass
for x in range(len(test["Fare"])):
    if pd.isnull(test["Fare"][x]):
        pclass = test["Pclass"][x]   #Pclass = 3
        test.loc[x, "Fare"] = round(train[ train["Pclass"] == pclass ]["Fare"].mean(), 2)
#map Fare values into groups of numerical values
train['FareBand'] = pd.qcut(train['Fare'], 4, labels = [1, 2, 3, 4])
test['FareBand']  = pd.qcut(test['Fare'], 4, labels = [1, 2, 3, 4])
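#(sketch) a read-only check of the quartile edges qcut chose; the Fare
# column is still present at this point
_, fare_edges = pd.qcut(train['Fare'], 4, retbins=True)
print( "FareBand quartile edges :", fare_edges )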
#drop Fare values
train = train.drop(['Fare'], axis = 1)
test  = test.drop(['Fare'], axis = 1)
#check train data
print( "\n\nFare column dropped.\n" )
print( train )
#check test data
print()
print( test.head() )
#****************************************
#6) Choosing the Best Model
#****************************************
#Splitting the Training Data
#We will use part of our training data (20% in this case) to test the
# accuracy of our different models.
from sklearn.model_selection import train_test_split
input_predictors = train.drop(['Survived', 'PassengerId'], axis=1)
output_target = train["Survived"]
x_train, x_val, y_train, y_val = train_test_split(
    input_predictors, output_target, test_size = 0.20, random_state = 7)
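#(sketch) an optional refinement: passing stratify=output_target keeps the
# survival ratio identical in the training and validation slices
#x_train, x_val, y_train, y_val = train_test_split(
#    input_predictors, output_target, test_size = 0.20,
#    random_state = 7, stratify = output_target)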
#Testing Different Models
#I will be testing the following models with my training data:
#1) Logistic Regression
#2) Gaussian Naive Bayes
#3) Support Vector Machines
#4) Linear SVC
#5) Perceptron
#6) Decision Tree Classifier
#7) Random Forest Classifier
#8) KNN or k-Nearest Neighbors
#9) Stochastic Gradient Descent
#10) Gradient Boosting Classifier
#For each model, we set up the model, fit it with 80% of our training data,
# predict for the remaining 20% and check the accuracy.
from sklearn.metrics import accuracy_score
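#(sketch) the ten blocks below all repeat the same fit/predict/score
# pattern; a small helper like this could condense them (they are kept
# separate here to mirror the original step-by-step walk-through)
def fit_and_score(model):
    model.fit(x_train, y_train)
    return round(accuracy_score(y_val, model.predict(x_val)) * 100, 2)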
#MODEL-1) Logistic Regression
#------------------------------------------
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_val)
acc_logreg = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-1: Accuracy of LogisticRegression : ", acc_logreg )
#OUTPUT:-
#MODEL-1: Accuracy of LogisticRegression : 77.09
#MODEL-2) Gaussian Naive Bayes
#------------------------------------------
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-2: Accuracy of GaussianNB : ", acc_gaussian )
#OUTPUT:-
#MODEL-2: Accuracy of GaussianNB : 78.68
#MODEL-3) Support Vector Machines
#------------------------------------------
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_val)
acc_svc = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-3: Accuracy of Support Vector Machines : ", acc_svc )
#OUTPUT:-
#MODEL-3: Accuracy of Support Vector Machines : 82.74
#MODEL-4) Linear SVC
#------------------------------------------
from sklearn.svm import LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_val)
acc_linear_svc = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-4: Accuracy of LinearSVC : ", acc_linear_svc )
#OUTPUT:-
#MODEL-4: Accuracy of LinearSVC : 78.68
#MODEL-5) Perceptron
#------------------------------------------
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-5: Accuracy of Perceptron : ", acc_perceptron )
#OUTPUT:-
#MODEL-5: Accuracy of Perceptron : 79.19
#MODEL-6) Decision Tree Classifier
#------------------------------------------
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier()
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-6: Accuracy of DecisionTreeClassifier : ", acc_decisiontree )
#OUTPUT:-
#MODEL-6: Accuracy of DecisionTreeClassifier : 81.22
#MODEL-7) Random Forest
#------------------------------------------
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier()
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-7: Accuracy of RandomForestClassifier : ", acc_randomforest )
#OUTPUT:-
#MODEL-7: Accuracy of RandomForestClassifier : 83.25
#MODEL-8) KNN or k-Nearest Neighbors
#------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_val)
acc_knn = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-8: Accuracy of k-Nearest Neighbors : ", acc_knn )
#OUTPUT:-
#MODEL-8: Accuracy of k-Nearest Neighbors : 77.66
#MODEL-9) Stochastic Gradient Descent
#------------------------------------------
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-9: Accuracy of Stochastic Gradient Descent : ", acc_sgd )
#OUTPUT:-
#MODEL-9: Accuracy of Stochastic Gradient Descent : 71.07
#MODEL-10) Gradient Boosting Classifier
#------------------------------------------
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_val, y_pred) * 100, 2)
print( "MODEL-10: Accuracy of GradientBoostingClassifier : ", acc_gbk )
#OUTPUT:-
#MODEL-10: Accuracy of GradientBoostingClassifier : 84.77
#Let's compare the accuracies of each model!
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Gaussian Naive Bayes', 'Support Vector Machines',
              'Linear SVC', 'Perceptron', 'Decision Tree',
              'Random Forest', 'KNN', 'Stochastic Gradient Descent',
              'Gradient Boosting Classifier'],
    'Score': [acc_logreg, acc_gaussian, acc_svc,
              acc_linear_svc, acc_perceptron, acc_decisiontree,
              acc_randomforest, acc_knn, acc_sgd, acc_gbk]
})
print()
print( models.sort_values(by='Score', ascending=False) )
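#(sketch) a single 80/20 split can be noisy; k-fold cross-validation
# averages over several splits and gives a more robust comparison,
# e.g. on the random forest trained above
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(randomforest, input_predictors, output_target, cv=5)
print( "RandomForest 5-fold CV accuracy :", round(cv_scores.mean() * 100, 2) )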
#According to the above report, I decided to use the Random Forest model
# for the testing data.
#7) Creating Submission Result File
#***********************************
#It is time to create a submission.csv file which includes our
# predictions for the test data.
#set ids as PassengerId and predict survival
ids = test['PassengerId']
predictions = randomforest.predict(test.drop('PassengerId', axis=1))
#set the output as a dataframe and convert to a csv file named submission.csv
output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
output.to_csv('submission.csv', index=False)
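#(sketch) a quick sanity check that the file was written as expected
print( pd.read_csv('submission.csv').head() )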
- print( "All survival predictions done." )
- print( "All predictions exported to submission.csv file." )
- print( "output : \n",output )