# Untitled

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.cross_validation import KFold
# Load the data set from CSV; header=0 takes the column names from the first
# row of the file.  Code further down reads its "Year" and "Time" columns.
male100 = pd.read_csv("data/male100.csv",header=0)

def bestFit(W):
    """Return the least-squares straight line fitted to the data set W.

    Solves the normal equations ``(X^T X) w = X^T t`` where ``X`` has a
    column of ones followed by the "Year" column of ``scaleData(W, 0, 0)``,
    and ``t`` is the "Time" column taken directly from ``W``.

    Args:
        W: pandas DataFrame with "Year" and "Time" columns.

    Returns:
        A NumPy array of shape (2, 1): ``[[intercept], [slope]]``.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    # NOTE(review): scaleData is defined outside this section, so what the
    # (0, 0) arguments do to "Year" cannot be confirmed from here.  The
    # targets below are deliberately read from W, not from the scaled frame.
    scaled = scaleData(W, 0, 0)
    years = scaled["Year"].values

    # Build the design matrix directly rather than inserting a "1s" column,
    # so the frame returned by scaleData is never mutated.
    X = np.column_stack((np.ones(len(years)), years))

    # Double brackets keep t two-dimensional, shape (n, 1).  `.values` is
    # used because DataFrame.as_matrix no longer exists in current pandas.
    t = W[["Time"]].values

    # Solving the linear system is numerically safer than forming the
    # explicit inverse of X^T X, which is badly conditioned for raw years.
    Xt = X.transpose()
    return np.linalg.solve(Xt.dot(X), Xt.dot(t))

# 4-fold cross-validation of the straight-line fit over every row of male100.
# The sample count is taken from the data instead of being hard-coded.
# NOTE(review): KFold(n, n_folds=...) and iterating the object directly are
# the legacy sklearn.cross_validation API that this file imports; newer
# scikit-learn releases only provide sklearn.model_selection.KFold.
n_samples = len(male100)
kf = KFold(n_samples, n_folds=4)

loss = 0
for train_index, test_index in kf:
    # KFold yields positional indices, so select rows by position.
    train_rows = male100.iloc[train_index]
    test_rows = male100.iloc[test_index]

    # Fit once per fold; bestFit returns [[intercept], [slope]].
    coefficients = np.ravel(bestFit(train_rows))
    intercept, slope = coefficients[0], coefficients[1]

    def est(x):
        """Return the fitted line's estimate of Time at year(s) x."""
        return intercept + slope * x

    # NOTE(review): est is evaluated on the raw "Year" values; this assumes
    # the scaling applied inside bestFit leaves "Year" in its original units
    # -- TODO confirm against scaleData.
    residuals = (test_rows["Time"] - est(test_rows["Year"])).values

    # Accumulate the squared-error loss over this fold's held-out rows.
    loss += np.sum(residuals ** 2)

# Mean squared error over all held-out samples.  The whole expression is
# parenthesised so it prints correctly under both Python 2 and Python 3.
print((1.0 / n_samples) * loss)