View difference between Paste ID: eibQgEm1 and PRydxzC4
SHOW: | | - or go back to the newest paste.
1
####Partial replication & reanalysis of "NHTSA’s Implausible Safety Claim for Tesla’s Autosteer Driver Assistance System" by Quality Control Systems Corp####
2
####This script was written by Aube on Feb 13, 2019####
3
####Feel free to do whatever you want with it as long as you give proper attribution####
4
5
6
library(openxlsx)
7
setwd("K:\\Rfiles\\Data")
8
9
##preliminary steps: load data, trying to replicate QCS report as understood by skimming the intro##
10
teslasheet <- read.xlsx("teslasafety.xlsx",na.strings="-") #this is a copy of spreadsheet: "PE16_007_PRODUCTION DATA" from http://www.safetyresearch.net/Library/2018-11-26%2520Redacted%2520PE16_007_PRODUCT
11
ION%2520DATA_jlq_working_file_10Jan2017%2520.xlsx
12
#teslatable <- read.csv("teslasafety.csv",header=TRUE,sep=",",na.strings=c("-",""),nrows=43781,stringsAsFactors=FALSE) #nrows does not count header row!
13
14
dim(teslasheet)
15
16
redacted_cols <- rep(0,dim(teslasheet)[2])
17
num_redacted <- 0
18
for(i in 1:dim(teslasheet)[2]){
19
	if(grepl("REDACTED",names(teslasheet)[i])){
20
		num_redacted <- num_redacted+1
21
		redacted_cols[num_redacted] <- i
22
	}
23
}
24
25
teslasheet_redacted <- teslasheet[,-redacted_cols[1:num_redacted]]
26
27
teslasheet_red_com <- teslasheet_redacted[teslasheet_redacted[,2]==teslasheet_redacted[,3],]
28
events_before <- sum(teslasheet_red_com[,6],na.rm=T) #note here that cols 6 and 7 show that no car had more than 1 event before/after
29
events_after <- sum(teslasheet_red_com[,7],na.rm=T)
30
31
miles_before <- sum(teslasheet_red_com[,8],na.rm=T)
32
miles_after <- sum(teslasheet_red_com[,9],na.rm=T)
33
34
poisson.test(c(events_before,events_after),c(miles_before,miles_after))
35
#point estimate favours before autosteer, but difference not statistically significant at 95% confidence level
36
37
sum(teslasheet_red_com[,6] & !teslasheet_red_com[,8],na.rm=T)
38
which(teslasheet_red_com[,6] & !teslasheet_red_com[,8])
39
teslasheet_red_com[16543,]
40
#one event in which an event occured before installation of autosteer even though 0 miles driven. Logically impossible observation should be omitted
41
sum(teslasheet_red_com[,7] & !teslasheet_red_com[,9],na.rm=T)
42
which(teslasheet_red_com[,7] & !teslasheet_red_com[,9])
43
teslasheet_red_com[9162,]
44
#also not possible. Miles driven is 0, but had an event
45
events_before <- events_before-1
46
events_after <- events_after-1
47
miles_after <- miles_after-1624
48
49
poisson.test(c(events_before,events_after),c(miles_before,miles_after))
50
#still not significant
51
#doesn't match QCS conclusions, but not particularly unexpected. Will follow their methodology more closely in another pass
52
53
##Confirming NHTSA methodology, following page 8 of QCS Report##
54
sum(teslasheet_redacted[,2]!=teslasheet_redacted[,8],na.rm=T) #assumption about method for calculating miles before autosteer is correct
55
sum(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])) #oh wait, we have na in one but not another? Let's take a closer look
56
57
teslasheet_redacted[which(is.na(teslasheet_redacted[,2])!=is.na(teslasheet_redacted[,8])),c(2,8)]
58
#messy way of treating data, interchanging NA and 0, but can be lived with
59
60
sum(teslasheet_redacted[,9]!=(teslasheet_redacted[,1]-teslasheet_redacted[,3]),na.rm=T) #assumption about method for calculating miles after autosteer is also corrected
61
62
##Choosing the subset per QCS: previous mileage before autosteer install should match next mileage after install
63
sum(teslasheet_redacted[,2]==teslasheet_redacted[,3],na.rm=T) #20408!=5714. Something's wrong
64
65
sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,2]),na.rm=T) #explicitly omitting where previous mileage is unknown doesn't help
66
67
sum((teslasheet_redacted[,2]!=teslasheet_redacted[,3]),na.rm=T) #omit where final mileage is unknown, as they mentioned, lowers the number slightly
68
69
sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0),na.rm=T) #restrict to where previous mileage is >0 to ensure comparability - 5719, very close!
70
71
sum((teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1]),na.rm=T) #omit file mileage unknown cases on top of that, 5714, bingo
72
73
complete_only <- (teslasheet_redacted[,2]==teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1])
74
complete_indices <- which(complete_only)
75
teslasheet_com <- teslasheet_redacted[complete_indices,]
76
77
events_before <- sum(teslasheet_com[,4],na.rm=T)
78
events_after <- sum(teslasheet_com[,5],na.rm=T)
79
80
mileage_before <- sum(teslasheet_com[,8],na.rm=T)
81
mileage_after <- sum(teslasheet_com[,9],na.rm=T)
82
#numbers match up with Figure 1 in QCS report
83
84
outcome_QCS <- c(teslasheet_com[,4],teslasheet_com[,5])
85
mileage_QCS <- c(teslasheet_com[,8],teslasheet_com[,9])
86
has_autosteer <- c(rep(0,5714),rep(1,5714))
87
88
model_logit <- glm(outcome_QCS~mileage_QCS+has_autosteer,family=binomial(link="logit")) 
89
summary(model_logit)
90
#results match that of Table 1, doesn't show odds ratios, etc. but when estimates and p-values match, I'm willing to take the rest of the printout on faith
91
92
##Replicating NHTSA part 2##
93
neither_only <- (teslasheet_redacted[,2]==0|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]==0|is.na(teslasheet_redacted[,3]))&(teslasheet_redacted[,1]>0)&!is.na(teslasheet_redacted[,1])
94
sum(neither_only)
95
#14792 versus 14791 reported by QCS. I'm not sure what exclusion criteria I'm missing
96
#the ones I used are: mileage before not reported or 0, mileage after not reported or 0, total mileage neither unreported nor 0
97
98
99
#I'm not comfortable with interpretting 0 miles as "unreported" without further evidence. Let's see if that choice can be validated
100
teslasheet_redacted[which(neither_only&teslasheet_redacted[,6]),] #one of the paradoxical before events happened where miles before install was reported at 0 (observation 39290). This supports excluding 0 mileage before install cases
101
102
103
##Replicating exposure gap##
104
incomp <- which((teslasheet_redacted[,2]<teslasheet_redacted[,3])&(teslasheet_redacted[,2]>0)&!is.na(teslasheet_redacted[,1])) #mileage before strictly less than mileage after
105
length(incomp)
106
teslasheet_incomp <- teslasheet_redacted[incomp,]
107
gap <- sum(teslasheet_incomp[,3]-teslasheet_incomp[,2]) #total exposure gap miles
108
known_before <- sum(teslasheet_incomp[,8])
109
known_after <- sum(teslasheet_incomp[,9])
110
gap/known_before
111
112
#it's not clear to me how Figures 3A and 3B were constructed since it should vary depending on the order in which the points are added?
113
#this is how I visualize the same data
114
plot(teslasheet_incomp[,8],teslasheet_incomp[,3]-teslasheet_incomp[,2]) #the higher the known miles, the smaller the gap
115
plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2])
116
plot(teslasheet_incomp[,9],teslasheet_incomp[,3]-teslasheet_incomp[,2],xlim=c(0,40000),ylim=c(0,6*10^4)) #omits a few outliers. Maybe a positive correlation?
117
118
119
##Mileage before not reported, mileage after known##
120
one_report <- which(((teslasheet_redacted[,2]==0)|is.na(teslasheet_redacted[,2]))&(teslasheet_redacted[,3]>0)&!is.na(teslasheet_redacted[,3])&!is.na(teslasheet_redacted[,1])&(teslasheet_redacted[,1]>0))
121
length(one_report)
122
teslasheet_one <- teslasheet_redacted[one_report,]
123
gap_one <- sum(teslasheet_one[,3])
124
after_one <- sum(teslasheet_one[,9])
125
126
##The numbers as computed by NHTSA##
127
events_before_all <- sum(teslasheet_redacted[,6],na.rm=T)
128
events_after_all <- sum(teslasheet_redacted[,7],na.rm=T)
129
130
miles_before_all <- sum(teslasheet_redacted[,8],na.rm=T)
131
miles_after_all <- sum(teslasheet_redacted[,9],na.rm=T)
132
133
poisson.test(c(events_before_all,events_after_all),c(miles_before_all,miles_after_all))
134
(events_before_all/miles_before_all-events_after_all/miles_after_all)/(events_before_all/miles_before_all)
135
136
137
##Permutation test (my own contribution to the analysis, based on cases where data is complete)##
138
#H_0: the probability of having an event per mile is the same after the installation of autosteering as before
139
#H_1: the probability of having an event per mile is not the same after the installation of autosteering as before
140
#We will only use the cases where an event happened, and ask "what is the chance of getting a number of events_before at least as extreme as the one we observed if the null hypothesis is true?"
141
#the threshold of significance is 0.05, per convention
142
143
events <- (teslasheet_com[,4]==T)+(teslasheet_com[,5]==T)
144
sum(events) #96 total events
145
146
147
teslasheet_events <- teslasheet_com[which(events>0),]
148
num_events <- teslasheet_events[,4]+teslasheet_events[,5]
149
events_index=1
150
151
events_cutoff <- rep(0,sum(events))
152
for(i in 1:dim(teslasheet_events)[1]){
153
	cutoff <- teslasheet_events[i,8]/teslasheet_events[i,1] #assuming uniform probability of accident for any car, the chance of an accident before install would correspond to the proportion of miles before autosteer
154
	if(num_events[i]==1){
155
		events_cutoff[events_index] <- cutoff
156
		events_index <- events_index+1
157
	}else{
158
		for(i in 1:num_events[i]){
159
			events_cutoff[events_index] <- cutoff
160
			events_index <- events_index+1
161
		}
162
	}
163
}
164
165
sim_num <- 10^6
166
167
set.seed(2007)
168
random_mat <- matrix(runif(sim_num*sum(events)),sum(events),sim_num) #this is a 96x(10^6) matrix of numbers selected from U(0,1) distribution
169
170
result_mat <- matrix(NA,sum(events),sim_num)
171
for(i in 1:sim_num){
172
	result_mat[,i] <- (random_mat[,i]<events_cutoff) #if the random number is smaller than the cutoff, the event happens before installation, otherwise it happens afterwards (implicitly)
173
}
174
175
events_before_BS <- colSums(result_mat) #in this manner, we generate a bootstrap distribution for the number of events before installation
176
sum(events_before_BS<=events_before)/sim_num #0.000295
177
#since we are doing a two-tailed test, we need to double the above number to get the estimated p-value
178
2*sum(events_before_BS<=events_before)/sim_num #0.00059
179
180
pval <- 2*sum(events_before_BS<=events_before)/sim_num #0.00059
181
#this is much, much lower than the threshold, so we reject the null hypothesis