Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.cross_validation import train_test_split
- from sklearn.metrics import mean_squared_error
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import confusion_matrix
- from sklearn.preprocessing import StandardScaler
- from random import randint
- from sklearn.tree import export_graphviz
- from sklearn.ensemble import RandomForestRegressor
def categorizeAge(x, split=(3, 8, 18, 40, 55)):
    '''Return a categorical age band for age x based on the split bounds.

    Band i covers split[i-1] <= x < split[i]; ages >= split[-1] map to
    len(split) (5 for the default bounds).  Returns None for missing (NaN)
    ages.

    The default is a tuple rather than a list to avoid the shared
    mutable-default-argument pitfall; any ascending sequence of bounds works,
    which also generalizes the original hard-coded 5-way elif chain.
    '''
    if pd.isnull(x):
        return None
    # Negative ages failed every band test in the original elif chain and fell
    # through to the final else (category 5).  NOTE(review): probably never
    # occurs since ages are imputed, but preserved for compatibility.
    if x < 0:
        return len(split)
    for band, upper in enumerate(split):
        if x < upper:
            return band
    return len(split)
def categorizeEmbarked(x):
    '''Encode an embarkation port code as an integer.

    "S" -> 0, "C" -> 1, anything else (including "Q" or missing) -> 2.
    '''
    portCodes = {"S": 0, "C": 1}
    return portCodes.get(x, 2)
def isConsecutive(A, A_fare, B, B_fare):
    '''Extracts numeric part from strings A and B and returns True if A is
    consecutive to B (i.e. num(A) == num(B) + 1) AND the two fares are equal.
    '''
    digitsA = ''.join(ch for ch in A if ch.isdigit())
    digitsB = ''.join(ch for ch in B if ch.isdigit())
    try:
        valueA, valueB = int(digitsA), int(digitsB)
    except ValueError:
        # At least one ticket had no digits at all; such tickets can never be
        # consecutive, so force both values to the same sentinel.
        valueA = valueB = 0
    return bool(valueA == valueB + 1 and A_fare == B_fare)
def fixFareAndFindGroups(df):
    '''Sorts (lexically) by ticket and iterates through rows to do the following:
    1. If a group with the same ticket is found, divide the corresponding fare
       by that group size (the listed fare is the group total).
    2. If a group with consecutive ticket numbers (and equal fares) is found,
       record a group size in a new feature called possibleGroupSize.

    NOTE(review): relies on DataFrame.ix (mixed label/positional indexing),
    which was deprecated in pandas 0.20 and removed in 1.0 -- consistent with
    the Python 2 era stack this script targets, but it must become
    .loc/.iloc on modern pandas.
    '''
    df = df.sort_values(by="ticket")
    df["possibleGroupSize"] = 0
    sameGroup = []      # row labels in the current run of identical tickets
    consecGroup = []    # row labels in the current run of consecutive tickets
    prevTicket = "?"
    prevIndex = 100000  #infinity -- a label assumed absent so the first .ix lookup raises
    for index, row in df.iterrows():
        # Positional column indices used by the .ix writes below.
        ticketIndex = df.columns.get_loc("ticket")
        fareIndex = df.columns.get_loc("fare")
        groupIndex = df.columns.get_loc("possibleGroupSize")
        currentTicket = str(row["ticket"])
        try:
            prevTicket = df.ix[prevIndex,ticketIndex]
        except KeyError:
            # First iteration: no previous row yet, just remember this label.
            prevIndex = index
            continue
        # NOTE(review): currentTicket is str()-coerced but prevTicket is not;
        # purely numeric tickets could compare unequal by type -- verify
        # against the dataset's ticket column dtype.
        if currentTicket == prevTicket:
            # Extend (or start) the run of rows sharing one exact ticket.
            if sameGroup == []:
                sameGroup.append(prevIndex)
            sameGroup.append(index)
        elif isConsecutive(currentTicket,row["fare"], prevTicket,df.ix[prevIndex,fareIndex]):
            # Extend (or start) the run of consecutive-numbered tickets.
            if consecGroup == []:
                consecGroup.append(prevIndex)
            consecGroup.append(index)
        else:
            # Run ended: split the group-total fare evenly across its members.
            for l in sameGroup:
                df.ix[l,fareIndex] = float(df.ix[l,fareIndex]) / float(len(sameGroup))
                # NOTE(review): this writes len(consecGroup) onto the
                # SAME-ticket rows, and the consecutive-ticket rows never get
                # possibleGroupSize at all -- the docstring suggests the
                # feature was meant for consecGroup; confirm intent before
                # trusting this feature.
                df.ix[l,groupIndex] = len(consecGroup)
            sameGroup = []
            consecGroup = []
        prevIndex = index
    return df
def dropRowsWithMissingValue(df, attribute):
    '''Return df restricted to the rows whose `attribute` value is present.

    The previous implementation filled NaNs with the sentinel string "NAN"
    and then filtered it out, which (a) coerced numeric columns such as fare
    to object dtype and (b) mutated the caller's DataFrame via the fillna
    assignment.  Filtering on notnull() drops the same rows while keeping
    dtypes intact and leaving the input untouched.
    '''
    return df[df[attribute].notnull()]
def imputeAndCategorizeAge(df):
    '''Imputes age using random forest regressor on remaining attributes
    adapted from blog: http://www.ultravioletanalytics.com/2014/11/03/kaggle-titanic-competition-part-ii-missing-values/'''
    # Features usable by the regressor; age first so it can be split off as
    # the target column below.
    age_df = df[['age', 'fare', 'pclass', 'sex', 'nameLength']]
    missingMask = df.age.isnull()
    # Rows with a known age form the training set; the rest get predictions.
    knownAge = age_df.loc[~missingMask]
    unknownAge = age_df.loc[missingMask]
    # Column 0 is the regression target (age), the remainder the predictors.
    y = knownAge.values[:, 0]
    X = knownAge.values[:, 1:]
    regressor = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    regressor.fit(X, y)
    # Predict the missing ages and write them back into the full frame.
    predictedAges = regressor.predict(unknownAge.values[:, 1:])
    df.loc[missingMask, 'age'] = predictedAges
    df["categorizedAge"] = df["age"].apply(categorizeAge)
    return df
def preProcess(df, target):
    '''Cleans, preprocess and transforms titanic3 dataset as described in
    project report. During the preprocessing, we temporarily recombine target
    so that any row deletion reflects on corresponding target as well.'''
    combined = pd.concat([df, target], axis=1)
    # Rows missing fare or embarked are dropped outright.
    combined = dropRowsWithMissingValue(combined, "fare")
    combined = dropRowsWithMissingValue(combined, "embarked")
    # Encode categorical attributes numerically and derive name length.
    combined["sex"] = combined["sex"].apply(lambda s: 0 if s == "female" else 1)
    combined["embarked"] = combined["embarked"].apply(categorizeEmbarked)
    combined["nameLength"] = combined["name"].apply(len)
    combined = fixFareAndFindGroups(combined)
    combined = imputeAndCategorizeAge(combined)
    # Keep only the engineered feature set plus the target column.
    keep = ['pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked',
            'categorizedAge', 'possibleGroupSize', 'nameLength', 'survived']
    combined = combined.filter(keep, 1)
    return [combined.drop("survived", axis=1), combined.filter(["survived"], 1)]
def runModel(testDataMatrix, trainingDataMatrix, trainingTarget, DTfileName='tree.dot'):
    '''Prediction model that returns prediction on test data based on training data'''
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0)
    tree.fit(trainingDataMatrix, trainingTarget)
    # Side effect: dump the fitted tree in graphviz .dot format for the report.
    export_graphviz(tree, out_file=DTfileName, feature_names=testDataMatrix.columns)
    return tree.predict(testDataMatrix)
def evaluate(X_test, X_train, Y_test, Y_train):
    '''Train on (X_train, Y_train), predict X_test, and print MSE, accuracy
    and the confusion matrix against Y_test.  Python 2 print statements.'''
    print "\nEvaluating model with features:", ','.join(X_train.columns.values)
    y_pred=runModel(X_test, X_train, Y_train)
    print '\nMean squared error: %.2f' % mean_squared_error(Y_test,y_pred)
    # Accuracy is reported as a percentage; %% escapes the literal % sign.
    print 'Accuracy: %.2f%%' %(accuracy_score(Y_test,y_pred)*100)
    print 'Confusion matrix:\n', confusion_matrix(y_true=Y_test, y_pred=y_pred)
# NOTE(review): this script is Python 2 (print statements) and uses the
# pre-0.20 scikit-learn API (sklearn.cross_validation, removed in 0.20 in
# favour of sklearn.model_selection); it will not run unmodified on a modern
# stack.
#Load dataset from csv file (titanic3.csv expected alongside this script)
titanic=pd.read_csv("titanic3.csv")
#Split input and target
X = titanic.drop("survived",1)
Y = titanic.filter(["survived"],1)
#Split training and testing sets (70/30 split, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
'''To be fair, preprocessing is done indepedently for the test and training set.
This is especially necessary to ensure independent imputation in test and training set.'''
[X_train, Y_train] = preProcess(X_train, Y_train)
[X_test, Y_test] = preProcess(X_test, Y_test)
# Put stats (optional)
print 'Test set characteristics:\n', X_test.describe()
print '\nTraining set characteristics:\n', X_train.describe()
#evaluate with no feature selection (all engineered features)
evaluate(X_test, X_train, Y_test, Y_train)
#Feature selection #1 (Table 2 in report)
print 'Feature selection #1 (Table 2 in report)'
X_trainS = X_train.filter(["sex","fare","nameLength","categorizedAge"],1)
X_testS = X_test.filter(["sex","fare","nameLength","categorizedAge"],1)
evaluate(X_testS, X_trainS, Y_test, Y_train)
#Feature selection #2 (Table 2 in report)
print 'Feature selection #2 (Table 2 in report)'
X_trainS = X_train.filter(["pclass","sex","possibleGroupSize"],1)
X_testS = X_test.filter(["pclass","sex","possibleGroupSize"],1)
evaluate(X_testS, X_trainS, Y_test, Y_train)
#Feature selection #3 (Table 2 in report)
print 'Feature selection #3 (Table 2 in report)'
X_trainS = X_train.filter(["nameLength","pclass","fare","categorizedAge","sibsp","sex"],1)
X_testS = X_test.filter(["nameLength","pclass","fare","categorizedAge","sibsp","sex"],1)
evaluate(X_testS, X_trainS, Y_test, Y_train)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement