Advertisement
ayush3504

titanic3 dataset analysis

Mar 4th, 2016
156
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.75 KB | None | 0 0
  1. import pandas as pd
  2. from sklearn.tree import DecisionTreeClassifier
  3. from sklearn.cross_validation import train_test_split
  4. from sklearn.metrics import mean_squared_error
  5. from sklearn.metrics import accuracy_score
  6. from sklearn.metrics import confusion_matrix
  7. from sklearn.preprocessing import StandardScaler
  8. from random import randint
  9. from sklearn.tree import export_graphviz
  10. from sklearn.ensemble import RandomForestRegressor
  11.  
  12. def categorizeAge(x, split=[3,8,18,40,55]):
  13.     ''' return categorical value of age based on 5 split values'''
  14.     if pd.isnull(x):
  15.         return None
  16.    
  17.     if (x>=0)and (x<split[0]):
  18.         return 0
  19.     elif (x>=split[0]) and (x<split[1]):
  20.         return 1
  21.     elif (x>=split[1]) and (x<split[2]):
  22.         return 2
  23.     elif (x>=split[2]) and (x<split[3]):
  24.         return 3
  25.     elif (x>=split[3]) and (x<split[4]):
  26.         return 4
  27.     else:
  28.         return 5
  29.        
  30. def categorizeEmbarked(x):
  31.     if (x == "S"):
  32.         return 0
  33.     elif (x == "C"):
  34.         return 1
  35.     else:
  36.         return 2
  37.  
  38. def isConsecutive(A, A_fare, B, B_fare):
  39.     '''Extracts numeric part from strings A and B and returns True if A is consecutive to B'''
  40.  
  41.     numA = ''
  42.     numB = ''
  43.    
  44.     for char in A:
  45.         if char.isdigit():
  46.             numA += char
  47.     for char in B:
  48.         if char.isdigit():
  49.             numB += char
  50.            
  51.     try:
  52.         numA = int(numA)
  53.         numB = int(numB)
  54.     except ValueError:
  55.         numA = 0
  56.         numB = 0
  57.        
  58.     if numA == numB + 1 and A_fare==B_fare:
  59.         return True
  60.  
  61.     return False
  62.    
  63. def fixFareAndFindGroups(df):
  64.     '''Sorts (lexically) by ticket and iterates through rows to do the following:
  65.    1. If a group with same ticket is found, divide corresponding fare by that group size
  66.    2, If a group with consecutive ticket numbers is found, assign that group size in
  67.    a new feature called possibleGroupSize'''
  68.  
  69.     df = df.sort_values(by="ticket")
  70.     df["possibleGroupSize"] = 0
  71.     sameGroup = []
  72.     consecGroup = []
  73.     prevTicket = "?"
  74.     prevIndex = 100000 #infinity
  75.     for index, row in df.iterrows():
  76.         ticketIndex = df.columns.get_loc("ticket")
  77.         fareIndex = df.columns.get_loc("fare")
  78.         groupIndex = df.columns.get_loc("possibleGroupSize")
  79.         currentTicket = str(row["ticket"])
  80.         try:        
  81.             prevTicket = df.ix[prevIndex,ticketIndex]
  82.         except KeyError:
  83.             prevIndex = index
  84.             continue
  85.         if currentTicket == prevTicket:
  86.             if sameGroup == []:
  87.                 sameGroup.append(prevIndex)
  88.             sameGroup.append(index)
  89.         elif isConsecutive(currentTicket,row["fare"], prevTicket,df.ix[prevIndex,fareIndex]):
  90.             if consecGroup == []:
  91.                 consecGroup.append(prevIndex)
  92.             consecGroup.append(index)                
  93.         else:
  94.             for l in sameGroup:
  95.                 df.ix[l,fareIndex] = float(df.ix[l,fareIndex]) / float(len(sameGroup))
  96.                 df.ix[l,groupIndex] = len(consecGroup)
  97.             sameGroup = []
  98.             consecGroup = []
  99.         prevIndex = index
  100.     return df
  101.  
  102. def dropRowsWithMissingValue(df, attribute):
  103.     df[attribute]=df[attribute].fillna("NAN")
  104.     return df[df[attribute] != "NAN"]
  105.  
  106. def imputeAndCategorizeAge(df):
  107.     '''Imputes age using random forest regressor on remaining attributes
  108.    adapted from blog: http://www.ultravioletanalytics.com/2014/11/03/kaggle-titanic-competition-part-ii-missing-values/'''
  109.        
  110.     # Grab all the features that can be included in a Random Forest Regressor
  111.     age_df = df[['age', 'fare', 'pclass', 'sex','nameLength']]
  112.  
  113.     # Split into sets with known and unknown Age values
  114.     knownAge = age_df.loc[ (df.age.notnull()) ]
  115.     unknownAge = age_df.loc[ (df.age.isnull()) ]
  116.  
  117.     # All age values are stored in a target array
  118.     y = knownAge.values[:, 0]
  119.  
  120.     # All the other values are stored in the feature array
  121.     X = knownAge.values[:, 1::]
  122.  
  123.     # Create and fit a model
  124.     rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
  125.     rtr.fit(X, y)
  126.  
  127.     # Use the fitted model to predict the missing values
  128.     predictedAges = rtr.predict(unknownAge.values[:, 1::])
  129.  
  130.     # Assign those predictions to the full data set
  131.     df.loc[ (df.age.isnull()), 'age' ] = predictedAges
  132.  
  133.     df["categorizedAge"]=df["age"].apply(categorizeAge)
  134.  
  135.     return df
  136.    
  137.  
  138. def preProcess(df, target):    
  139.     '''Cleans, preprocess and transforms titanic3 dataset as described in
  140.    project report. During the preprocessing, we temporarily recombine target
  141.    so that any row deletion reflects on corresponding target as well.'''
  142.     df = pd.concat([df, target],axis=1)
  143.     df = dropRowsWithMissingValue(df, "fare")
  144.     df = dropRowsWithMissingValue(df, "embarked")
  145.     df["sex"]=df["sex"].apply(lambda x: 0 if x=="female" else 1)        
  146.     df["embarked"]=df["embarked"].apply(categorizeEmbarked)
  147.     df["nameLength"]=df["name"].apply(len)        
  148.     df = fixFareAndFindGroups(df)
  149.     df = imputeAndCategorizeAge(df)
  150.     df = df.filter(['pclass', 'sex', 'sibsp', 'parch', 'fare',\
  151.     'embarked', 'categorizedAge', 'possibleGroupSize', 'nameLength', 'survived'],1)
  152.  
  153.     return [df.drop("survived",axis=1), df.filter(["survived"],1)]
  154.  
  155. def runModel(testDataMatrix, trainingDataMatrix, trainingTarget, DTfileName='tree.dot'):
  156.     '''Prediction model that returns prediction on test data based on training data'''    
  157.     classifier = DecisionTreeClassifier(criterion = 'entropy', max_depth=6, random_state=0)
  158.     classifier.fit(trainingDataMatrix, trainingTarget)
  159.     export_graphviz(classifier, out_file=DTfileName,feature_names=testDataMatrix.columns)
  160.     return classifier.predict(testDataMatrix)
  161.  
  162. def evaluate(X_test, X_train, Y_test, Y_train):
  163.     print "\nEvaluating model with features:", ','.join(X_train.columns.values)
  164.     y_pred=runModel(X_test, X_train, Y_train)
  165.     print '\nMean squared error: %.2f' % mean_squared_error(Y_test,y_pred)
  166.     print 'Accuracy: %.2f%%' %(accuracy_score(Y_test,y_pred)*100)
  167.     print 'Confusion matrix:\n', confusion_matrix(y_true=Y_test, y_pred=y_pred)
  168.  
  169. #Load dataset from csv file
  170. titanic=pd.read_csv("titanic3.csv")
  171.  
  172. #Split input and target
  173. X = titanic.drop("survived",1)
  174. Y = titanic.filter(["survived"],1)
  175.  
  176. #Split training and testing sets
  177. X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
  178.  
  179. '''To be fair, preprocessing is done indepedently for the test and training set.
  180. This is especially necessary to ensure independent imputation in test and training set.'''
  181. [X_train, Y_train] = preProcess(X_train, Y_train)
  182. [X_test, Y_test] = preProcess(X_test, Y_test)
  183.  
  184. # Put stats (optional)
  185. print 'Test set characteristics:\n', X_test.describe()
  186. print '\nTraining set characteristics:\n', X_train.describe()
  187.  
  188. #evaluate with no feature selection
  189. evaluate(X_test, X_train, Y_test, Y_train)
  190.  
  191.  
  192. #Feature selection #1 (Table 2 in report)
  193. print 'Feature selection #1 (Table 2 in report)'
  194. X_trainS = X_train.filter(["sex","fare","nameLength","categorizedAge"],1)
  195. X_testS = X_test.filter(["sex","fare","nameLength","categorizedAge"],1)
  196. evaluate(X_testS, X_trainS, Y_test, Y_train)
  197.  
  198.  
  199. #Feature selection #2 (Table 2 in report)
  200. print 'Feature selection #2 (Table 2 in report)'
  201. X_trainS = X_train.filter(["pclass","sex","possibleGroupSize"],1)
  202. X_testS = X_test.filter(["pclass","sex","possibleGroupSize"],1)
  203. evaluate(X_testS, X_trainS, Y_test, Y_train)
  204.  
  205. #Feature selection #3 (Table 2 in report)
  206. print 'Feature selection #3 (Table 2 in report)'
  207. X_trainS = X_train.filter(["nameLength","pclass","fare","categorizedAge","sibsp","sex"],1)
  208. X_testS = X_test.filter(["nameLength","pclass","fare","categorizedAge","sibsp","sex"],1)
  209. evaluate(X_testS, X_trainS, Y_test, Y_train)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement