SHOW:
|
|
- or go back to the newest paste.
1 | ####Partial replication & reanalysis of "NHTSA’s Implausible Safety Claim for Tesla’s Autosteer Driver Assistance System" by Quality Control Systems Corp#### | |
2 | ####This script was written by Aube on Feb 13, 2019#### | |
3 | ####Feel free to do whatever you want with it as long as you give proper attribution#### | |
4 | ||
5 | ||
6 | library(openxlsx) | |
7 | setwd("K:\\Rfiles\\Data") | |
8 | ||
9 | ##preliminary steps: load data, trying to replicate QCS report as understood by skimming the intro## | |
10 | teslasheet <- read.xlsx("teslasafety.xlsx",na.strings="-") #this is a copy of spreadsheet: "PE16_007_PRODUCTION DATA" from http://www.safetyresearch.net/Library/2018-11-26%2520Redacted%2520PE16_007_PRODUCT | |
11 | ION%2520DATA_jlq_working_file_10Jan2017%2520.xlsx | |
12 | #teslatable <- read.csv("teslasafety.csv",header=TRUE,sep=",",na.strings=c("-",""),nrows=43781,stringsAsFactors=FALSE) #nrows does not count header row! | |
13 | ||
14 | dim(teslasheet) | |
15 | ||
16 | redacted_cols <- rep(0,dim(teslasheet)[2]) | |
17 | num_redacted <- 0 | |
18 | for(i in 1:dim(teslasheet)[2]){ | |
19 | if(grepl("REDACTED",names(teslasheet)[i])){ | |
20 | num_redacted <- num_redacted+1 | |
21 | redacted_cols[num_redacted] <- i | |
22 | } | |
23 | } | |
24 | ||
25 | teslasheet_redacted <- teslasheet[,-redacted_cols[1:num_redacted]] | |
26 | ||
27 | teslasheet_red_com <- teslasheet_redacted[teslasheet_redacted[,2]==teslasheet_redacted[,3],] | |
28 | events_before <- sum(teslasheet_red_com[,6],na.rm=T) #note here that cols 6 and 7 show that no car had more than 1 event before/after | |
29 | events_after <- sum(teslasheet_red_com[,7],na.rm=T) | |
30 | ||
31 | miles_before <- sum(teslasheet_red_com[,8],na.rm=T) | |
32 | miles_after <- sum(teslasheet_red_com[,9],na.rm=T) | |
33 | ||
34 | poisson.test(c(events_before,events_after),c(miles_before,miles_after)) | |
35 | #point estimate favours before autosteer, but difference not statistically significant at 95% confidence level | |
36 | ||
37 | sum(teslasheet_red_com[,6] & !teslasheet_red_com[,8],na.rm=T) | |
38 | which(teslasheet_red_com[,6] & !teslasheet_red_com[,8]) | |
39 | teslasheet_red_com[16543,] | |
40 | #one event in which an event occured before installation of autosteer even though 0 miles driven. Logically impossible observation should be omitted | |
41 | sum(teslasheet_red_com[,7] & !teslasheet_red_com[,9],na.rm=T) | |
42 | which(teslasheet_red_com[,7] & !teslasheet_red_com[,9]) | |
43 | teslasheet_red_com[9162,] | |
44 | #also not possible. Miles driven is 0, but had an event | |
45 | events_before <- events_before-1 | |
46 | events_after <- events_after-1 | |
47 | miles_after <- miles_after-1624 | |
48 | ||
49 | poisson.test(c(events_before,events_after),c(miles_before,miles_after)) | |
50 | #still not significant | |
51 | #doesn't match QCS conclusions, but not particularly unexpected. Will follow their methodology more closely in another pass | |
52 | ||
53 | ##Confirming NHTSA methodology, following page 8 of QCS Report## | |
54 | sum(teslasheet_redacted[,2]!=teslasheet_redacted[,8],na.rm=T) #assumption about method for calculating miles before autosteer is correct | |
55 | sum(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])) #oh wait, we have na in one but not another? Let's take a closer look | |
56 | ||
57 | teslasheet_redacted[which(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])),c(2,8)] | |
58 | #messy way of treating data, interchanging NA and 0, but can be lived with | |
59 | ||
60 | sum(teslasheet_redacted[,9]!=(teslasheet_redacted[,1]-teslasheet_redacted[,3]),na.rm=T) #assumption about method for calculating miles after autosteer is also corrected | |
61 | ||
62 | ##Choosing the subset per QCS: previous mileage before autosteer install should match next mileage after install | |
63 | sum(teslasheet_redacted[,2]==teslasheet_redacted[,3],na.rm=T) #20408!=5714. Something's wrong | |
64 | ||
65 | sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,2]),na.rm=T) #explicitly omitting where previous mileage is unknown doesn't help | |
66 | ||
67 | sum((teslasheet_redacted[,2]!=teslasheet_redacted[,3]),na.rm=T) #omit where final mileage is unknown, as they mentioned, lowers the number slightly | |
68 | ||
69 | sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0),na.rm=T) #restrict to where previous mileage is >0 to ensure comparability - 5719, very close! | |
70 | ||
71 | sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1]),na.rm=T) #omit file mileage unknown cases on top of that, 5714, bingo | |
72 | ||
73 | complete_only <- (teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1]) | |
74 | complete_indices <- which(complete_only) | |
75 | teslasheet_com <- teslasheet_redacted[complete_indices,] | |
76 | ||
77 | events_before <- sum(teslasheet_com[,4],na.rm=T) | |
78 | events_after <- sum(teslasheet_com[,5],na.rm=T) | |
79 | ||
80 | mileage_before <- sum(teslasheet_com[,8],na.rm=T) | |
81 | mileage_after <- sum(teslasheet_com[,9],na.rm=T) | |
82 | #numbers match up with Figure 1 in QCS report | |
83 | ||
84 | outcome_QCS <- c(teslasheet_com[,4],teslasheet_com[,5]) | |
85 | mileage_QCS <- c(teslasheet_com[,8],teslasheet_com[,9]) | |
86 | has_autosteer <- c(rep(0,5714),rep(1,5714)) | |
87 | ||
88 | model_logit <- glm(outcome_QCS~mileage_QCS+has_autosteer,family=binomial(link="logit")) | |
89 | summary(model_logit) | |
90 | #results match that of Table 1, doesn't show odds ratios, etc. but when estimates and p-values match, I'm willing to take the rest of the printout on faith | |
91 | ||
92 | ##Replicating NHTSA part 2## | |
93 | neither_only <- (teslasheet_redacted[,2]==0|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]==0|is.na(teslasheet_redacted[,3]))&(teslasheet_redacted[,1]>0)&!is.na(teslasheet_redacted[,1]) | |
94 | sum(neither_only) | |
95 | #14792 versus 14791 reported by QCS. I'm not sure what exclusion criteria I'm missing | |
96 | #the ones I used are: mileage before not reported or 0, mileage after not reported or 0, total mileage neither unreported nor 0 | |
97 | ||
98 | ||
99 | #I'm not comfortable with interpretting 0 miles as "unreported" without further evidence. Let's see if that choice can be validated | |
100 | teslasheet_redacted[which(neither_only&teslasheet_redacted[,6]),] #one of the paradoxical before events happened where miles before install was reported at 0 (observation 39290). This supports excluding 0 mileage before install cases | |
101 | ||
102 | ||
103 | ##Replicating exposure gap## | |
104 | incomp <- which((teslasheet_redacted[,2]<teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1])) #mileage before strictly less than mileage after | |
105 | length(incomp) | |
106 | teslasheet_incomp <- teslasheet_redacted[incomp,] | |
107 | gap <- sum(teslasheet_incomp[,3]-teslasheet_incomp[,2]) #total exposure gap miles | |
108 | known_before <- sum(teslasheet_incomp[,8]) | |
109 | known_after <- sum(teslasheet_incomp[,9]) | |
110 | gap/known_before | |
111 | ||
112 | #it's not clear to me how Figures 3A and 3B were constructed since it should vary depending on the order in which the points are added? | |
113 | #this is how I visualize the same data | |
114 | plot(teslasheet_incomp[,8],teslasheet_incomp[,3]-teslasheet_incomp[,2]) #the higher the known miles, the smaller the gap | |
115 | plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2]) | |
116 | plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2],xlim=c(0,40000),ylim=c(0,6*10^4)) #omits a few outliers. Maybe a positive correlation? | |
117 | ||
118 | ||
119 | ##Mileage before not reported, mileage after known## | |
120 | one_report <- which(((teslasheet_redacted[,2]==0)|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]>0)&!is.na(teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,1])&(teslasheet_redacted[,1]>0)) | |
121 | length(one_report) | |
122 | teslasheet_one <- teslasheet_redacted[one_report,] | |
123 | gap_one <- sum(teslasheet_one[,3]) | |
124 | after_one <- sum(teslasheet_one[,9]) | |
125 | ||
126 | ##The numbers as computed by NHTSA## | |
127 | events_before_all <- sum(teslasheet_redacted[,6],na.rm=T) | |
128 | events_after_all <- sum(teslasheet_redacted[,7],na.rm=T) | |
129 | ||
130 | miles_before_all <- sum(teslasheet_redacted[,8],na.rm=T) | |
131 | miles_after_all <- sum(teslasheet_redacted[,9],na.rm=T) | |
132 | ||
133 | poisson.test(c(events_before_all,events_after_all),c(miles_before_all,miles_after_all)) | |
134 | (events_before_all/miles_before_all-events_after_all/miles_after_all)/(events_before_all/miles_before_all) | |
135 | ||
136 | ||
137 | ##Permutation test (my own contribution to the analysis, based on cases where data is complete)## | |
138 | #H_0: the probability of having an event per mile is the same after the installation of autosteering as before | |
139 | #H_1: the probability of having an event per mile is not the same after the installation of autosteering as before | |
140 | #We will only use the cases where an event happened, and ask "what is the chance of getting a number of events_before at least as extreme as the one we observed if the null hypothesis is true?" | |
141 | #the threshold of significance is 0.05, per convention | |
142 | ||
143 | events <- (teslasheet_com[,4]==T)+(teslasheet_com[,5]==T) | |
144 | sum(events) #96 total events | |
145 | ||
146 | ||
147 | teslasheet_events <- teslasheet_com[which(events>0),] | |
148 | num_events <- teslasheet_events[,4]+teslasheet_events[,5] | |
149 | events_index=1 | |
150 | ||
151 | events_cutoff <- rep(0,sum(events)) | |
152 | for(i in 1:dim(teslasheet_events)[1]){ | |
153 | cutoff <- teslasheet_events[i,8]/teslasheet_events[i,1] #assuming uniform probability of accident for any car, the chance of an accident before install would correspond to the proportion of miles before autosteer | |
154 | if(num_events[i]==1){ | |
155 | events_cutoff[events_index] <- cutoff | |
156 | events_index <- events_index+1 | |
157 | }else{ | |
158 | for(i in 1:num_events[i]){ | |
159 | events_cutoff[events_index] <- cutoff | |
160 | events_index <- events_index+1 | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | sim_num <- 10^6 | |
166 | ||
167 | set.seed(2007) | |
168 | random_mat <- matrix(runif(sim_num*sum(events)),sum(events),sim_num) #this is a 96x(10^6) matrix of numbers selected from U(0,1) distribution | |
169 | ||
170 | result_mat <- matrix(NA,sum(events),sim_num) | |
171 | for(i in 1:sim_num){ | |
172 | result_mat[,i] <- (random_mat[,i]<events_cutoff) #if the random number is smaller than the cutoff, the event happens before installation, otherwise it happens afterwards (implicitly) | |
173 | } | |
174 | ||
175 | events_before_BS <- colSums(result_mat) #in this manner, we generate a bootstrap distribution for the number of events before installation | |
176 | sum(events_before_BS<=events_before)/sim_num #0.000295 | |
177 | #since we are doing a two-tailed test, we need to double the above number to get the estimated p-value | |
178 | 2*sum(events_before_BS<=events_before)/sim_num #0.00059 | |
179 | ||
180 | pval <- 2*sum(events_before_BS<=events_before)/sim_num #0.00059 | |
181 | #this is much, much lower than the threshold, so we reject the null hypothesis |