Guest User

Replication/reanalysis of QCS's Feb 2019 Tesla report

a guest
Feb 13th, 2019
679
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ####Partial replication & reanalysis of "NHTSA’s Implausible Safety Claim for Tesla’s Autosteer Driver Assistance System" by Quality Control Systems Corp####
  2. ####This script was written by Aube on Feb 13, 2019####
  3. ####Feel free to do whatever you want with it as long as you give proper attribution####
  4.  
  5.  
  6. library(openxlsx)
  7. setwd("K:\\Rfiles\\Data")
  8.  
  9. ##preliminary steps: load data, trying to replicate QCS report as understood by skimming the intro##
  10. teslasheet <- read.xlsx("teslasafety.xlsx",na.strings="-") #this is a copy of spreadsheet: "PE16_007_PRODUCTION DATA" from http://www.safetyresearch.net/Library/2018-11-26%2520Redacted%2520PE16_007_PRODUCT
  11. ION%2520DATA_jlq_working_file_10Jan2017%2520.xlsx
  12. #teslatable <- read.csv("teslasafety.csv",header=TRUE,sep=",",na.strings=c("-",""),nrows=43781,stringsAsFactors=FALSE) #nrows does not count header row!
  13.  
  14. dim(teslasheet)
  15.  
  16. redacted_cols <- rep(0,dim(teslasheet)[2])
  17. num_redacted <- 0
  18. for(i in 1:dim(teslasheet)[2]){
  19.     if(grepl("REDACTED",names(teslasheet)[i])){
  20.         num_redacted <- num_redacted+1
  21.         redacted_cols[num_redacted] <- i
  22.     }
  23. }
  24.  
  25. teslasheet_redacted <- teslasheet[,-redacted_cols[1:num_redacted]]
  26.  
  27. teslasheet_red_com <- teslasheet_redacted[teslasheet_redacted[,2]==teslasheet_redacted[,3],]
  28. events_before <- sum(teslasheet_red_com[,6],na.rm=T) #note here that cols 6 and 7 show that no car had more than 1 event before/after
  29. events_after <- sum(teslasheet_red_com[,7],na.rm=T)
  30.  
  31. miles_before <- sum(teslasheet_red_com[,8],na.rm=T)
  32. miles_after <- sum(teslasheet_red_com[,9],na.rm=T)
  33.  
  34. poisson.test(c(events_before,events_after),c(miles_before,miles_after))
  35. #point estimate favours before autosteer, but difference not statistically significant at 95% confidence level
  36.  
  37. sum(teslasheet_red_com[,6] & !teslasheet_red_com[,8],na.rm=T)
  38. which(teslasheet_red_com[,6] & !teslasheet_red_com[,8])
  39. teslasheet_red_com[16543,]
  40. #one event in which an event occured before installation of autosteer even though 0 miles driven. Logically impossible observation should be omitted
  41. sum(teslasheet_red_com[,7] & !teslasheet_red_com[,9],na.rm=T)
  42. which(teslasheet_red_com[,7] & !teslasheet_red_com[,9])
  43. teslasheet_red_com[9162,]
  44. #also not possible. Miles driven is 0, but had an event
  45. events_before <- events_before-1
  46. events_after <- events_after-1
  47. miles_after <- miles_after-1624
  48.  
  49. poisson.test(c(events_before,events_after),c(miles_before,miles_after))
  50. #still not significant
  51. #doesn't match QCS conclusions, but not particularly unexpected. Will follow their methodology more closely in another pass
  52.  
  53. ##Confirming NHTSA methodology, following page 8 of QCS Report##
  54. sum(teslasheet_redacted[,2]!=teslasheet_redacted[,8],na.rm=T) #assumption about method for calculating miles before autosteer is correct
  55. sum(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])) #oh wait, we have na in one but not another? Let's take a closer look
  56.  
  57. teslasheet_redacted[which(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])),c(2,8)]
  58. #messy way of treating data, interchanging NA and 0, but can be lived with
  59.  
  60. sum(teslasheet_redacted[,9]!=(teslasheet_redacted[,1]-teslasheet_redacted[,3]),na.rm=T) #assumption about method for calculating miles after autosteer is also corrected
  61.  
  62. ##Choosing the subset per QCS: previous mileage before autosteer install should match next mileage after install
  63. sum(teslasheet_redacted[,2]==teslasheet_redacted[,3],na.rm=T) #20408!=5714. Something's wrong
  64.  
  65. sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,2]),na.rm=T) #explicitly omitting where previous mileage is unknown doesn't help
  66.  
  67. sum((teslasheet_redacted[,2]!=teslasheet_redacted[,3]),na.rm=T) #omit where final mileage is unknown, as they mentioned, lowers the number slightly
  68.  
  69. sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0),na.rm=T) #restrict to where previous mileage is >0 to ensure comparability - 5719, very close!
  70.  
  71. sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1]),na.rm=T) #omit file mileage unknown cases on top of that, 5714, bingo
  72.  
  73. complete_only <- (teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1])
  74. complete_indices <- which(complete_only)
  75. teslasheet_com <- teslasheet_redacted[complete_indices,]
  76.  
  77. events_before <- sum(teslasheet_com[,4],na.rm=T)
  78. events_after <- sum(teslasheet_com[,5],na.rm=T)
  79.  
  80. mileage_before <- sum(teslasheet_com[,8],na.rm=T)
  81. mileage_after <- sum(teslasheet_com[,9],na.rm=T)
  82. #numbers match up with Figure 1 in QCS report
  83.  
  84. outcome_QCS <- c(teslasheet_com[,4],teslasheet_com[,5])
  85. mileage_QCS <- c(teslasheet_com[,8],teslasheet_com[,9])
  86. has_autosteer <- c(rep(0,5714),rep(1,5714))
  87.  
  88. model_logit <- glm(outcome_QCS~mileage_QCS+has_autosteer,family=binomial(link="logit"))
  89. summary(model_logit)
  90. #results match that of Table 1, doesn't show odds ratios, etc. but when estimates and p-values match, I'm willing to take the rest of the printout on faith
  91.  
  92. ##Replicating NHTSA part 2##
  93. neither_only <- (teslasheet_redacted[,2]==0|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]==0|is.na(teslasheet_redacted[,3]))&(teslasheet_redacted[,1]>0)&!is.na(teslasheet_redacted[,1])
  94. sum(neither_only)
  95. #14792 versus 14791 reported by QCS. I'm not sure what exclusion criteria I'm missing
  96. #the ones I used are: mileage before not reported or 0, mileage after not reported or 0, total mileage neither unreported nor 0
  97.  
  98.  
  99. #I'm not comfortable with interpretting 0 miles as "unreported" without further evidence. Let's see if that choice can be validated
  100. teslasheet_redacted[which(neither_only&teslasheet_redacted[,6]),] #one of the paradoxical before events happened where miles before install was reported at 0 (observation 39290). This supports excluding 0 mileage before install cases
  101.  
  102.  
  103. ##Replicating exposure gap##
  104. incomp <- which((teslasheet_redacted[,2]<teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1])) #mileage before strictly less than mileage after
  105. length(incomp)
  106. teslasheet_incomp <- teslasheet_redacted[incomp,]
  107. gap <- sum(teslasheet_incomp[,3]-teslasheet_incomp[,2]) #total exposure gap miles
  108. known_before <- sum(teslasheet_incomp[,8])
  109. known_after <- sum(teslasheet_incomp[,9])
  110. gap/known_before
  111.  
  112. #it's not clear to me how Figures 3A and 3B were constructed since it should vary depending on the order in which the points are added?
  113. #this is how I visualize the same data
  114. plot(teslasheet_incomp[,8],teslasheet_incomp[,3]-teslasheet_incomp[,2]) #the higher the known miles, the smaller the gap
  115. plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2])
  116. plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2],xlim=c(0,40000),ylim=c(0,6*10^4)) #omits a few outliers. Maybe a positive correlation?
  117.  
  118.  
  119. ##Mileage before not reported, mileage after known##
  120. one_report <- which(((teslasheet_redacted[,2]==0)|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]>0)&!is.na(teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,1])&(teslasheet_redacted[,1]>0))
  121. length(one_report)
  122. teslasheet_one <- teslasheet_redacted[one_report,]
  123. gap_one <- sum(teslasheet_one[,3])
  124. after_one <- sum(teslasheet_one[,9])
  125.  
  126. ##The numbers as computed by NHTSA##
  127. events_before_all <- sum(teslasheet_redacted[,6],na.rm=T)
  128. events_after_all <- sum(teslasheet_redacted[,7],na.rm=T)
  129.  
  130. miles_before_all <- sum(teslasheet_redacted[,8],na.rm=T)
  131. miles_after_all <- sum(teslasheet_redacted[,9],na.rm=T)
  132.  
  133. poisson.test(c(events_before_all,events_after_all),c(miles_before_all,miles_after_all))
  134. (events_before_all/miles_before_all-events_after_all/miles_after_all)/(events_before_all/miles_before_all)
  135.  
  136.  
  137. ##Permutation test (my own contribution to the analysis, based on cases where data is complete)##
  138. #H_0: the probability of having an event per mile is the same after the installation of autosteering as before
  139. #H_1: the probability of having an event per mile is not the same after the installation of autosteering as before
  140. #We will only use the cases where an event happened, and ask "what is the chance of getting a number of events_before at least as extreme as the one we observed if the null hypothesis is true?"
  141. #the threshold of significance is 0.05, per convention
  142.  
  143. events <- (teslasheet_com[,4]==T)+(teslasheet_com[,5]==T)
  144. sum(events) #96 total events
  145.  
  146.  
  147. teslasheet_events <- teslasheet_com[which(events>0),]
  148. num_events <- teslasheet_events[,4]+teslasheet_events[,5]
  149. events_index=1
  150.  
  151. events_cutoff <- rep(0,sum(events))
  152. for(i in 1:dim(teslasheet_events)[1]){
  153.     cutoff <- teslasheet_events[i,8]/teslasheet_events[i,1] #assuming uniform probability of accident for any car, the chance of an accident before install would correspond to the proportion of miles before autosteer
  154.     if(num_events[i]==1){
  155.         events_cutoff[events_index] <- cutoff
  156.         events_index <- events_index+1
  157.     }else{
  158.         for(i in 1:num_events[i]){
  159.             events_cutoff[events_index] <- cutoff
  160.             events_index <- events_index+1
  161.         }
  162.     }
  163. }
  164.  
  165. sim_num <- 10^6
  166.  
  167. set.seed(2007)
  168. random_mat <- matrix(runif(sim_num*sum(events)),sum(events),sim_num) #this is a 96x(10^6) matrix of numbers selected from U(0,1) distribution
  169.  
  170. result_mat <- matrix(NA,sum(events),sim_num)
  171. for(i in 1:sim_num){
  172.     result_mat[,i] <- (random_mat[,i]<events_cutoff) #if the random number is smaller than the cutoff, the event happens before installation, otherwise it happens afterwards (implicitly)
  173. }
  174.  
  175. events_before_BS <- colSums(result_mat) #in this manner, we generate a bootstrap distribution for the number of events before installation
  176. sum(events_before_BS<=events_before)/sim_num #0.000295
  177. #since we are doing a two-tailed test, we need to double the above number to get the estimated p-value
  178. 2*sum(events_before_BS<=events_before)/sim_num #0.00059
  179.  
  180. pval <- 2*sum(events_before_BS<=events_before)/sim_num #0.00059
  181. #this is much, much lower than the threshold, so we reject the null hypothesis
RAW Paste Data