python

from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
import numpy as np
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time

class ols:
    """
    Author: Vincent Nijs (+ ?)

    Email: v-nijs at kellogg.northwestern.edu

    Last Modified: Mon Jan 15 17:56:17 CST 2007

    Dependencies: See import statement at the top of this file

    Doc: Class for multi-variate regression using OLS

    For usage examples of other class methods see the class tests at the bottom of this file. To see the class in action
    simply run this file using 'python ols.py'. This will generate some simulated data and run various analyses. If you have rpy installed
    the same model will also be estimated by R for confirmation.

    Input:
         dependent variable
        y_varnm = string with the variable label for y
        x = independent variables, note that a constant is added by default
        x_varnm = string or list of variable labels for the independent variables

    Output:
        There are no values returned by the class. Summary provides printed output.
        All other measures can be accessed as follows:

        Step 1: Create an OLS instance by passing data to the class

            m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])

        Step 2: Get specific metrics

            To print the coefficients:
                >>> print m.b
            To print the coefficients p-values:
                >>> print m.p

    """

y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8,
                38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5,
                77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7]
            #tuition
x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891,
                971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179,
                2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291]
            #research and development
x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00,
                72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00,
                151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00,
                267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00,
                397629.00]
            #one/none parents
x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649,
                15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978,
                22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319]

def __init__(self,y,x1,y_varnm = 'y',x_varnm = ''):
    """
    Initializing the ols class.
    """
    self.y = y
    #self.x1 = c_[ones(x1.shape[0]),x1]
    self.y_varnm = y_varnm
    if not isinstance(x_varnm,list):
        self.x_varnm = ['const'] + list(x_varnm)
    else:
        self.x_varnm = ['const'] + x_varnm

    # Estimate model using OLS
    self.estimate()

def estimate(self):

    # estimating coefficients, and basic stats
    self.inv_xx = inv(dot(self.x.T,self.x))
    xy = dot(self.x.T,self.y)
    self.b = dot(self.inv_xx,xy)                    # estimate coefficients

    self.nobs = self.y.shape[0]                     # number of observations
    self.ncoef = self.x.shape[1]                    # number of coef.
    self.df_e = self.nobs - self.ncoef              # degrees of freedom, error
    self.df_r = self.ncoef - 1                      # degrees of freedom, regression

    self.e = self.y - dot(self.x,self.b)            # residuals
    self.sse = dot(self.e,self.e)/self.df_e         # SSE
    self.se = sqrt(diagonal(self.sse*self.inv_xx))  # coef. standard errors
    self.t = self.b / self.se                       # coef. t-statistics
    self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2    # coef. p-values

    self.R2 = 1 - self.e.var()/self.y.var()         # model R-squared
    self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef))   # adjusted R-square

    self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)  # model F-statistic
    self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)  # F-statistic p-value

def dw(self):
    """
    Calculates the Durbin-Waston statistic
    """
    de = diff(self.e,1)
    dw = dot(de,de) / dot(self.e,self.e);

    return dw

def omni(self):
    """
    Omnibus test for normality
    """
    return stats.normaltest(self.e)

def JB(self):
    """
    Calculate residual skewness, kurtosis, and do the JB test for normality
    """

    # Calculate residual skewness and kurtosis
    skew = stats.skew(self.e)
    kurtosis = 3 + stats.kurtosis(self.e)

    # Calculate the Jarque-Bera test for normality
    JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
    JBpv = 1-stats.chi2.cdf(JB,2);

    return JB, JBpv, skew, kurtosis

def ll(self):
    """
    Calculate model log-likelihood and two information criteria
    """

    # Model log-likelihood, AIC, and BIC criterion values
    ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
    aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
    bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs

    return ll, aic, bic

def summary(self):
    """
    Printing model output to screen
    """

    # local time & date
    t = time.localtime()

    # extra stats
    ll, aic, bic = self.ll()
    JB, JBpv, skew, kurtosis = self.JB()
    omni, omnipv = self.omni()

    # printing output to screen
    print '\n=============================================================================='
    print "Dependent Variable: " + self.y_varnm
    print "Method: Least Squares"
    print "Date: ", time.strftime("%a, %d %b %Y",t)
    print "Time: ", time.strftime("%H:%M:%S",t)
    print '# obs:               %5.0f' % self.nobs
    print '# variables:     %5.0f' % self.ncoef
    print '=============================================================================='
    print 'variable     coefficient     std. Error      t-statistic     prob.'
    print '=============================================================================='
    for i in range(len(self.x_varnm)):
        print '''% -5s          % -5.6f     % -5.6f     % -5.6f     % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]])
    print '=============================================================================='
    print 'Models stats                         Residual stats'
    print '=============================================================================='
    print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' % tuple([self.R2, self.dw()])
    print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
    print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
    print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
    print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
    print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
    print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
    print '=============================================================================='

if __name__ == '__main__':

    ##########################
    ### testing the ols class
    ##########################


    # intercept is added, by default
    m = ols(y, x, y_varnm = 'y',x_varnm = ['x1','x2','x3'])
    m.summary()