Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from scipy import stats
- from sklearn.cross_validation import KFold
# Load the data set; header=0 takes the column names from the first CSV row.
# The rest of the script reads its "Year" and "Time" columns.
male100 = pd.read_csv(
    "data/male100.csv",
    header=0,
)
# Least-squares straight-line fit for the data set W.
def bestFit(W):
    """Return the least-squares coefficients of a line fitted to W.

    W is a pandas DataFrame with "Year" and "Time" columns.  The inputs are
    the "Year" column of ``scaleData(W, 0, 0)``; the targets are the
    "Time" column of W itself.
    NOTE(review): scaleData is defined outside this block -- presumably it
    returns a rescaled copy of W; confirm what the two 0 arguments select.

    Returns a (2, 1) numpy array ``[[intercept], [slope]]``.
    """
    scaled = scaleData(W, 0, 0)
    # Leading column of ones so the first coefficient acts as the intercept.
    scaled.insert(0, '1s', 1)
    # Column selection + .values replaces DataFrame.as_matrix, which was
    # deprecated in pandas 0.23 and removed in pandas 1.0.
    X = scaled[["1s", "Year"]].values
    t = W[["Time"]].values
    # Solve the normal equations (X^T X) w = X^T t directly; this avoids
    # forming the explicit inverse, which is slower and less accurate.
    Xt = X.transpose()
    return np.linalg.solve(Xt.dot(X), Xt.dot(t))
# 4-fold cross-validation of the straight-line fit on male100.
# The sample count is taken from the data instead of being hard-coded as 27.
n_samples = len(male100)
kf = KFold(n_samples, n_folds=4)
loss = 0
for train_index, test_index in kf:
    # Fit once per fold (the fit does not depend on the point being
    # predicted) and unpack the (2, 1) coefficient array into two scalars.
    # KFold yields row positions, hence .iloc rather than label-based .loc.
    intercept, slope = bestFit(male100.iloc[train_index, :]).ravel()

    def est(x):
        # Line estimate for this fold: predicted Time at Year x.
        return intercept + slope * x

    held_out = male100.iloc[test_index, :]
    residuals = held_out["Time"] - est(held_out["Year"])
    # Loss function: accumulate the squared error on the held-out rows.
    loss += (residuals.values ** 2).sum()
# Mean squared prediction error over all samples.  The parenthesised form
# prints the same value under both Python 2 and Python 3.
print((1.0 / n_samples) * loss)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement