# titanic3 dataset analysis

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from random import randint
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor

def categorizeAge(x, split=(3, 8, 18, 40, 55)):
    '''Return the ordinal age bucket of x, or None when x is missing.

    split is a sequence of ascending, non-negative bucket boundaries; any
    number of boundaries is accepted. Bucket 0 holds ages in [0, split[0]),
    bucket i holds ages in [split[i-1], split[i]) and the last bucket,
    len(split), holds every age >= split[-1].

    Negative ages have no bucket of their own; they are reported as the last
    bucket (len(split)).'''
    if pd.isnull(x):
        return None

    if x < 0:
        # Out-of-range input ends up in the catch-all last bucket.
        return len(split)

    # With ascending boundaries, the bucket number is simply the number of
    # boundaries that x has reached or passed.
    return sum(1 for boundary in split if x >= boundary)

def categorizeEmbarked(x):
    '''Encodes an embarkation port letter as an integer:
    "S" -> 0, "C" -> 1 and any other value -> 2'''
    for code, port in enumerate(("S", "C")):
        if x == port:
            return code
    return 2

def isConsecutive(A, A_fare, B, B_fare):
    '''Keeps only the digit characters of strings A and B, reads each result as
    one integer, and returns True when A's number is exactly B's number plus one
    and A_fare equals B_fare. Returns False otherwise, including when either
    string contains no digits.'''
    digitsOfA = ''.join(ch for ch in A if ch.isdigit())
    digitsOfB = ''.join(ch for ch in B if ch.isdigit())

    try:
        ticketA = int(digitsOfA)
        ticketB = int(digitsOfB)
    except ValueError:
        # At least one ticket has no usable numeric part.
        return False

    return bool(ticketA == ticketB + 1 and A_fare == B_fare)

def _applyTicketGroups(df, sameGroup, consecGroup):
    '''Writes one finished pair of groups back into df (in place): every row
    label in sameGroup gets its fare divided by len(sameGroup) and its
    possibleGroupSize set to len(consecGroup). Does nothing when sameGroup is
    empty.'''
    shareCount = float(len(sameGroup))
    for label in sameGroup:
        df.loc[label, "fare"] = float(df.loc[label, "fare"]) / shareCount
        df.loc[label, "possibleGroupSize"] = len(consecGroup)


def fixFareAndFindGroups(df):
    '''Sorts (lexically) by ticket and iterates through rows to do the following:
    1. Rows that follow each other with the same ticket are collected as a group
    and, when the group is closed, each member's fare is divided by the group size
    2. Rows whose ticket is consecutive to the previous row's ticket (as decided
    by isConsecutive, which also requires equal fares) are collected as a second
    group; its size is stored in a new feature called possibleGroupSize

    Returns the sorted copy of df with the added possibleGroupSize column
    (0 where nothing was assigned). Rows are addressed by index label, so the
    index labels are expected to be unique.'''

    df = df.sort_values(by="ticket")
    df["possibleGroupSize"] = 0
    sameGroup = []      # index labels of rows sharing one ticket
    consecGroup = []    # index labels of rows with consecutive tickets
    hasPrevious = False
    prevIndex = None
    prevTicket = None
    prevFare = None
    for index, row in df.iterrows():
        currentTicket = str(row["ticket"])
        if hasPrevious:
            if currentTicket == prevTicket:
                if not sameGroup:
                    sameGroup.append(prevIndex)
                sameGroup.append(index)
            elif isConsecutive(currentTicket, row["fare"], prevTicket, prevFare):
                if not consecGroup:
                    consecGroup.append(prevIndex)
                consecGroup.append(index)
            else:
                # NOTE(review): groups are only closed here, i.e. when a row is
                # neither a repeat of nor consecutive to the previous ticket, and
                # possibleGroupSize is only written to the same-ticket rows
                # (rows that are merely consecutive keep 0). Kept as found;
                # confirm against the project report before changing.
                _applyTicketGroups(df, sameGroup, consecGroup)
                sameGroup = []
                consecGroup = []
        # Remember this row's values as read from the iteration snapshot, so the
        # next comparison uses the fare before any division.
        hasPrevious = True
        prevIndex = index
        prevTicket = row["ticket"]
        prevFare = row["fare"]

    # A group that is still open at the last row must be written back too;
    # without this the final group would be silently dropped.
    _applyTicketGroups(df, sameGroup, consecGroup)
    return df

def dropRowsWithMissingValue(df, attribute):
    '''Returns the rows of df whose value in column attribute is not missing
    (not NaN/None). The passed-in df is left untouched and the column keeps
    its dtype, because no placeholder string is written into it.'''
    return df[df[attribute].notnull()]

def imputeAndCategorizeAge(df):
    '''Imputes age using random forest regressor on remaining attributes
    adapted from blog: http://www.ultravioletanalytics.com/2014/11/03/kaggle-titanic-competition-part-ii-missing-values/

    Missing values in the age column are replaced by predictions of a
    RandomForestRegressor trained on the rows with a known age, using fare,
    pclass, sex and nameLength as features. Afterwards a categorizedAge column
    is added by applying categorizeAge to every age.

    df is modified in place and also returned. When no age is missing the
    model is not fitted at all.'''

    missingAge = df["age"].isnull()

    # Only train when there is something to impute: predicting on zero rows is
    # rejected by scikit-learn, and fitting 2000 trees would be wasted work.
    if missingAge.any():
        # Grab all the features that can be included in a Random Forest Regressor
        age_df = df[['age', 'fare', 'pclass', 'sex', 'nameLength']]

        # Split into sets with known and unknown Age values
        knownAge = age_df.loc[~missingAge]
        unknownAge = age_df.loc[missingAge]

        # Column 0 is the target (age); the other columns are the features
        y = knownAge.values[:, 0]
        X = knownAge.values[:, 1:]

        # Create and fit a model
        rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
        rtr.fit(X, y)

        # Use the fitted model to predict the missing values
        predictedAges = rtr.predict(unknownAge.values[:, 1:])

        # Assign those predictions to the full data set
        df.loc[missingAge, 'age'] = predictedAges

    df["categorizedAge"] = df["age"].apply(categorizeAge)

    return df


def preProcess(df, target):
    '''Cleans, preprocess and transforms titanic3 dataset as described in
    project report. During the preprocessing, we temporarily recombine target
    so that any row deletion reflects on corresponding target as well.

    Steps: drop rows without fare or embarked, encode sex (female -> 0,
    otherwise 1) and embarked as integers, add nameLength, fix fares and find
    ticket groups, impute and categorize age, then keep only the model columns.

    Returns a two-element list [features, target], where target is a
    one-column DataFrame holding the "survived" column.'''
    df = pd.concat([df, target], axis=1)
    df = dropRowsWithMissingValue(df, "fare")
    df = dropRowsWithMissingValue(df, "embarked")
    df["sex"] = df["sex"].apply(lambda x: 0 if x == "female" else 1)
    df["embarked"] = df["embarked"].apply(categorizeEmbarked)
    df["nameLength"] = df["name"].apply(len)
    df = fixFareAndFindGroups(df)
    df = imputeAndCategorizeAge(df)

    # items/axis are passed by keyword: the second positional parameter of
    # DataFrame.filter is `like`, not `axis`.
    keptColumns = ['pclass', 'sex', 'sibsp', 'parch', 'fare',
                   'embarked', 'categorizedAge', 'possibleGroupSize',
                   'nameLength', 'survived']
    df = df.filter(items=keptColumns, axis=1)

    return [df.drop("survived", axis=1), df.filter(items=["survived"], axis=1)]

def runModel(testDataMatrix, trainingDataMatrix, trainingTarget, DTfileName='tree.dot'):
    '''Fits an entropy-based decision tree (max depth 6, fixed random state) on
    the training data, exports the fitted tree to DTfileName via export_graphviz
    and returns the tree's predictions for the test data'''
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0)
    tree.fit(trainingDataMatrix, trainingTarget)

    # Node labels in the exported graph come from the test matrix's columns.
    columnNames = testDataMatrix.columns
    export_graphviz(tree, out_file=DTfileName, feature_names=columnNames)

    predictions = tree.predict(testDataMatrix)
    return predictions

def evaluate(X_test, X_train, Y_test, Y_train):
    '''Trains and applies the model via runModel, then prints the feature names
    used, the mean squared error, the accuracy (in percent) and the confusion
    matrix of the predictions against Y_test.

    Note the argument order: test features, training features, test target,
    training target. runModel is called with its default output file name, so
    each call rewrites the same tree file.

    Every print is a single-argument call so that the output is the same
    whether print is a statement or a function.'''
    print("\nEvaluating model with features: %s" % ','.join(X_train.columns.values))
    y_pred = runModel(X_test, X_train, Y_train)
    print('\nMean squared error: %.2f' % mean_squared_error(Y_test, y_pred))
    print('Accuracy: %.2f%%' % (accuracy_score(Y_test, y_pred) * 100))
    print('Confusion matrix:\n%s' % (confusion_matrix(y_true=Y_test, y_pred=y_pred),))

# Load dataset from csv file
titanic = pd.read_csv("titanic3.csv")

# Split input and target. items/axis are given by keyword: the second
# positional parameter of DataFrame.filter is `like`, not `axis`.
X = titanic.drop("survived", axis=1)
Y = titanic.filter(items=["survived"], axis=1)

# Split training and testing sets (70/30, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# To be fair, preprocessing is done independently for the test and training set.
# This is especially necessary to ensure independent imputation in test and
# training set.
[X_train, Y_train] = preProcess(X_train, Y_train)
[X_test, Y_test] = preProcess(X_test, Y_test)

# Put stats (optional). Single-argument prints give the same output whether
# print is a statement or a function.
print('Test set characteristics:\n%s' % (X_test.describe(),))
print('\nTraining set characteristics:\n%s' % (X_train.describe(),))

# Evaluate with no feature selection
evaluate(X_test, X_train, Y_test, Y_train)

# Feature selections #1-#3 (Table 2 in report), evaluated in order
featureSelections = [
    ["sex", "fare", "nameLength", "categorizedAge"],
    ["pclass", "sex", "possibleGroupSize"],
    ["nameLength", "pclass", "fare", "categorizedAge", "sibsp", "sex"],
]
for selectionNumber, selectedFeatures in enumerate(featureSelections, 1):
    print('Feature selection #%d (Table 2 in report)' % selectionNumber)
    X_trainS = X_train.filter(items=selectedFeatures, axis=1)
    X_testS = X_test.filter(items=selectedFeatures, axis=1)
    evaluate(X_testS, X_trainS, Y_test, Y_train)