Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def _max_vif(inputdata, xvar_names, first_xvar_index):
    """Return ``(largest VIF, column index)`` over the predictor columns.

    Each predictor is regressed on all the *other* predictors and its
    VIF is computed as ``1 / (1 - R^2)``. On ties the first column wins.
    ``(None, None)`` is returned when no VIF could be compared (for
    example when every R^2 came back as NaN).
    """
    best_vif = None
    best_col = None
    for i in range(first_xvar_index, len(xvar_names)):
        # Bind ``i`` and ``first`` as defaults so the values are fixed when
        # the lambda is created rather than looked up when Spark runs it.
        train_t = inputdata.rdd.map(
            lambda row, i=i, first=first_xvar_index: [
                Vectors.dense(row[first:i] + row[i + 1:]),
                row[i],
            ]
        ).toDF(['features', 'label'])
        lr = LinearRegression(
            featuresCol='features', labelCol='label', maxIter=2
        )
        r_sq = lr.fit(train_t).summary.r2
        # A perfect fit (R^2 == 1) means the column is an exact linear
        # combination of the others: treat its VIF as infinite instead of
        # dividing by zero.
        vif = float("inf") if r_sq >= 1 else 1 / (1 - r_sq)
        if best_vif is None or vif > best_vif:
            best_vif = vif
            best_col = i
    return best_vif, best_col


def vif_cal_iter(inputdata, vif_threshold, first_xvar_index=2):
    """Drop high-VIF predictor columns one at a time until none exceed the threshold.

    On every pass the variance inflation factor of each predictor column
    is computed, and the column with the largest VIF is dropped if that
    VIF is greater than ``vif_threshold``. The loop ends as soon as the
    largest VIF is within the threshold or fewer than two predictors
    remain (a VIF needs at least one other predictor to regress on).

    Args:
        inputdata: Spark DataFrame holding the data. Columns located
            before ``first_xvar_index`` are never used as predictors
            (presumably identifier/target columns -- confirm with caller).
        vif_threshold: Largest VIF a predictor may have and still be kept.
        first_xvar_index: Position of the first predictor column. Defaults
            to 2, the start index used by the original loop.

    Returns:
        The DataFrame with the offending predictor columns removed. The
        input object itself is returned when nothing had to be dropped.

    Raises:
        ValueError: If ``first_xvar_index`` is negative.
    """
    if first_xvar_index < 0:
        raise ValueError("first_xvar_index must be non-negative")

    while True:
        # Re-read the column list on every pass: it shrinks after a drop.
        xvar_names = list(inputdata.columns)
        if len(xvar_names) - first_xvar_index < 2:
            return inputdata

        vif_max, colnum_max = _max_vif(inputdata, xvar_names, first_xvar_index)
        if colnum_max is None or not vif_max > vif_threshold:
            return inputdata

        print("Start of If Block")
        inputdata = inputdata.drop(xvar_names[colnum_max])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement