# Aug 13th, 2019
def makecost(obs, prob, falsepos_cost, falseneg_cost):
    """Build a function giving the total misclassification cost at a cutoff.

    Parameters
    ----------
    obs : array-like
        Zero-one vector of ground-truth labels.
    prob : array-like
        Vector of predicted probabilities (e.g. one column of the output of
        ``predict_proba``), aligned element-wise with ``obs``.
    falsepos_cost : number
        Cost charged for each false positive.
    falseneg_cost : number
        Cost charged for each false negative.

    Returns
    -------
    numpy.vectorize
        Callable that accepts a scalar cutoff or an array of cutoffs and
        returns the summed cost for each cutoff. An observation is predicted
        positive when its probability is strictly greater than the cutoff.
    """
    # Coerce once so plain lists (not only ndarrays) support the arithmetic.
    obs = np.asarray(obs)
    prob = np.asarray(prob)
    negatives = 1 - obs  # independent of the cutoff, so computed only once

    def cost(cutoff):
        pred = prob > cutoff
        fpos = pred * negatives      # predicted 1, truth 0
        fneg = (1 - pred) * obs      # predicted 0, truth 1
        return np.sum(fpos * falsepos_cost + fneg * falseneg_cost)

    # np.vectorize lets the caller evaluate a whole grid of cutoffs in one call.
    return np.vectorize(cost)
10.
from sklearn.model_selection import KFold, cross_val_predict

# Grid of candidate cutoffs and an accumulator for the averaged cost curve.
cut = np.linspace(0, 1, 100)
cost = np.zeros_like(cut)
obs = np.ravel(y)
n_obs = X.shape[0]  # number of rows in X; used to express cost per data point

# Repeat shuffled 5-fold cross-validation K times and average the cost curves
# over the repetitions.
K = 20
for j in range(K):
    folds = KFold(n_splits=5, shuffle=True)
    # Column 1 of predict_proba is used as the score
    # (presumably the positive-class probability -- confirm class ordering).
    prob = cross_val_predict(logreg, X, obs, cv=folds,
                             method='predict_proba', n_jobs=5)[:, 1]
    getcost = makecost(obs, prob, falsepos_cost=20, falseneg_cost=25)
    # Divide by the row count, not the whole shape tuple, to get per-point cost.
    currentcost = getcost(cut) / n_obs
    cost += currentcost
    # Faint line for each individual repetition.
    plt.plot(cut, currentcost, c='C0', alpha=0.05)
cost /= K

# Solid line for the average over all repetitions.
plt.plot(cut, cost, c='C0')
plt.xlabel('cutoff')
plt.ylabel('Expected cost per data point')

# Cutoff on the grid with the lowest average cost.
bestcut = cut[np.argmin(cost)]
bestcut
