Advertisement
Radeen10-_

Final R statistical analysis for Titanic Dataset

Jun 6th, 2020 (edited)
205
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.71 KB | None | 0 0
  # Load the passenger records from disk.
  titanic <- read.csv(file = "D:/arik.csv", header = TRUE)
  # Keep only the rows that have no missing value in any column.
  x <- titanic[complete.cases(titanic), ]
  # Age and ticket fare of the complete-case passengers.
  p <- x$Age
  q <- x$Fare
  # Two-column data frame holding the same age and fare values.
  df <- data.frame(x.Age = p, x.Fare = q)
  # Compare the age sample (p) with the fare sample (q).
  # The help page is opened once, and only in an interactive session, so a
  # batch run of the script is not interrupted by documentation output.
  if (interactive()) {
    help("t.test")
  }
  # Side-by-side boxplots of the two samples.
  boxplot(p, q)
  # Two-sided t-test without the equal-variance assumption, testing a mean
  # difference of 0 and reporting a 90% confidence interval. `alternative`
  # is spelled out in full rather than relying on partial argument matching.
  t.test(
    p, q,
    mu = 0,
    alternative = "two.sided",
    conf.level = 0.90,
    var.equal = FALSE
  )
  17.  
  # Second, independent load of the same CSV file.
  titanic1 <- read.csv(file = "D:/arik.csv", header = TRUE)
  # Drop every row that contains at least one NA.
  x1 <- titanic1[complete.cases(titanic1), ]
  # Data-frame copy of the cleaned rows, used by the later analyses.
  df1 <- data.frame(x1)
  # Rows of the passengers whose Survived value equals 1.
  total_survival <- x1[which(x1$Survived == 1), , drop = FALSE]
  # Print the number of survivors.
  nrow(total_survival)
  # Open a viewer on the Sex column of the survivors.
  View(total_survival$Sex)

  # Survivors as a percentage of all complete-case passengers.
  total_passengers <- nrow(x1)
  percentage_of_total_survival <-
    100 * (nrow(total_survival) / total_passengers)

  # Survivors younger than 18 are treated as children.
  child <- total_survival[which(total_survival$Age < 18), , drop = FALSE]
  # k: number of survivors; m: number of child survivors.
  k <- nrow(total_survival)
  m <- nrow(child)
  # Children as a percentage of all survivors.
  total_childsurvival_percentage <- 100 * (m / k)
  45.  
  46.  
  # Multiple linear regression with Survived as the response and Age, Fare
  # and Pclass as the predictors, fitted on df1.
  help("lm")
  model <- lm(Survived ~ Age + Fare + Pclass, data = df1)
  # Coefficient table and fit statistics of the model.
  summary(model)
  # Default diagnostic plots for the fitted model.
  plot(model)

  # Analysis-of-variance table for the same formula.
  anova <- aov(Survived ~ Age + Fare + Pclass, data = df1)
  summary(anova)

  # 95% confidence intervals for the regression coefficients.
  confint(model, level = 0.95)
  66.  
  67.  
  # Pearson correlation between two of the predictors, as a quick check
  # for multicollinearity.
  # d1: ticket fare column of df1; c1: age column of df1.
  d1 <- df1$Fare
  c1 <- df1$Age
  cor(x = c1, y = d1, method = "pearson")
  74.  
  # Preparation for a chi-square test of independence, which checks for a
  # significant relationship between two nominal variables (here: survival
  # outcome and sex).
  # Recode the Survived indicator of df1 as a text label.
  survival_binary <- ifelse(df1$Survived, "Survive", "Dead")
  # One-column matrix of the labels; its column is named survival_binary.
  new <- cbind(survival_binary)
  # Append the label column to the passenger data.
  original <- cbind(df1, new)

  # df2: the passenger data together with the survival_binary column.
  df2 <- data.frame(original)
  View(df2)
  87.  
  88.  
  # Test whether the survival outcome is independent of sex, using df2.
  gender <- df2$Sex
  survival <- df2$survival_binary

  # Contingency table of sex by survival outcome: built once, then printed.
  TAB <- table(gender, survival)
  TAB

  # Open the help page only when the script is run interactively.
  if (interactive()) {
    help("chisq.test")
  }

  # Grouped bar chart of the counts with a legend for the table rows.
  # `legend.text` is the full argument name, and TRUE is used because the
  # shorthand T is an ordinary variable that can be reassigned.
  barplot(TAB, beside = TRUE, legend.text = TRUE)

  # Chi-square test of independence with continuity correction.
  CHI <- chisq.test(TAB, correct = TRUE)
  CHI
  # Expected cell counts under the independence hypothesis.
  CHI$expected

  # Fisher's exact test as an alternative to the chi-square approximation.
  fisher.test(TAB, conf.int = TRUE, conf.level = 0.95)
  110.  
  111.  
  # Count the survivors in df2 by group. Children are survivors younger than
  # 18; the female and male counts cover adults (Age >= 18) only, so that
  # n2 + n4 + n5 equals n1.
  #   n1: number of survivors
  #   n2: number of child survivors
  #   n3: number of female survivors of any age
  #   n4: number of adult female survivors
  #   n5: number of adult male survivors
  # Assumes Age and Sex contain no NA (df2 is built from complete cases).
  total_survivals <- subset(df2, Survived == 1)
  # n1 must be assigned before the male count below reads it.
  n1 <- nrow(total_survivals)

  child <- subset(total_survivals, Age < 18)
  n2 <- nrow(child)

  # The condition needs `==`. Writing `Sex = 'female'` passes a named
  # argument instead of a filter, and subset() then keeps every row.
  female <- subset(total_survivals, Sex == "female")
  n3 <- nrow(female)
  # Count adult women directly; n3 - n2 would also subtract surviving boys.
  n4 <- sum(female$Age >= 18)
  # The remaining survivors are the adult men (given Sex is male or female).
  male <- n1 - n2 - n4
  n5 <- male

  number_of_total <- nrow(df2)

  # Number of passengers who died.
  dead1 <- nrow(df2) - n1

  # Survivors and dead as percentages of all passengers in df2.
  total_surv. <- (n1 / number_of_total) * 100
  dead <- 100 - total_surv.
  143.  
  144.  
  145.  
  146.  
  # Percentage of all passengers in df2 (number_of_total) that falls into
  # each outcome-by-group cell. The groups are mutually exclusive:
  #   Child  - Age < 18, either sex
  #   Male   - Sex == "male"   and Age >= 18
  #   Female - Sex == "female" and Age >= 18
  # Every cell uses the same denominator, so the six cells are directly
  # comparable (and sum to 100 when Sex is only male/female and Age has no NA).

  # 1. Adult male survivors. The age filter keeps boys out of this group,
  #    since they are already counted as children.
  total_survivals1 <- subset(df2, Survived == 1 & Sex == "male" & Age >= 18)
  number_of_male_survive_percentage <-
    (nrow(total_survivals1) / number_of_total) * 100

  # 2. Adult female survivors. `>=` so that 18-year-olds are not left out of
  #    every group (children are defined as Age < 18).
  total_survivals2 <- subset(df2, Survived == 1 & Sex == "female" & Age >= 18)
  number_of_female_survive_percentage <-
    (nrow(total_survivals2) / number_of_total) * 100

  # 3. Child survivors.
  total_survivals3 <- subset(df2, survival_binary == "Survive" & Age < 18)
  number_of_child_survive_percentage <-
    (nrow(total_survivals3) / number_of_total) * 100

  # 4. Adult men who died.
  total_dead1 <- subset(
    df2,
    survival_binary == "Dead" & Sex == "male" & Age >= 18
  )
  no_of_male_dead_percentage <- (nrow(total_dead1) / number_of_total) * 100

  # 5. Adult women who died (computed instead of a placeholder 0).
  total_dead2 <- subset(
    df2,
    survival_binary == "Dead" & Sex == "female" & Age >= 18
  )
  no_of_female_dead <- (nrow(total_dead2) / number_of_total) * 100

  # 6. Children who died (computed instead of a placeholder 0).
  total_dead3 <- subset(df2, survival_binary == "Dead" & Age < 18)
  no_of_child_dead <- (nrow(total_dead3) / number_of_total) * 100

  # 3 x 2 matrix: rows are the groups, columns are survived / dead.
  # matrix() fills column by column, so the survival values come first.
  TAB2 <- matrix(
    c(
      number_of_male_survive_percentage,
      number_of_female_survive_percentage,
      number_of_child_survive_percentage,
      no_of_male_dead_percentage,
      no_of_female_dead,
      no_of_child_dead
    ),
    ncol = 2
  )
  colnames(TAB2) <- c("Total Survival Percentage", "Total Dead Percentage")
  row.names(TAB2) <- c("Male", "Female", "Child")

  # Convert the matrix into a table object.
  TAB2 <- as.table(TAB2)
  182.  
  183.  
  # Pie chart of the survival percentages (first column of TAB2) by group.
  # library() stops with an error when the package is missing; require()
  # would only return FALSE and let the brewer.pal() call fail later.
  library(RColorBrewer)
  if (interactive()) {
    help("brewer.pal")
  }

  # Rows of TAB2 with a non-zero survival percentage; only these are drawn.
  indices <- TAB2[, 1] != 0
  # One colour per row of TAB2, so the pie and its legend always agree and
  # a group keeps its colour when another group is left out.
  # brewer.pal() needs at least 3 colours, hence the max().
  slice_colours <- brewer.pal(max(3L, nrow(TAB2)), "Spectral")
  slice_colours <- slice_colours[seq_len(nrow(TAB2))]

  # Narrow the margins for this plot and restore the old settings afterwards.
  old_par <- par(mar = c(1, 4, 4, 1))
  pie(
    TAB2[indices, 1],
    labels = row.names(TAB2)[indices],
    col = slice_colours[indices],
    main = "Survival Rate by Gender"
  )
  legend(
    "topleft",
    legend = row.names(TAB2)[indices],
    fill = slice_colours[indices]
  )
  par(old_par)
  194.  
  195.  
  # Composition of the survivors: the child (n2), female (n4) and male (n5)
  # counts, each as a percentage of the number of survivors (n1).
  male_percentage <- 100 * (n5 / n1)
  female_percentage <- 100 * (n4 / n1)
  child_percentage <- 100 * (n2 / n1)
  # Collect the three percentages in a one-row data frame and open a viewer.
  new_dataframe <- data.frame(
    child_percentage = child_percentage,
    female_percentage = female_percentage,
    male_percentage = male_percentage
  )
  View(new_dataframe)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement