daily pastebin goal
67%
SHARE
TWEET

Untitled

a guest Feb 13th, 2018 63 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2.  
  3. u"""
  4. Beta regression for modeling rates and proportions.
  5.  
  6. References
  7. ----------
  8. GrĂ¼n, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
  9. in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
  10. Economics and Statistics, 2011.
  11.  
  12. Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
  13. Maximum-likelihood regression with beta-distributed dependent variables."
  14. Psychological methods 11.1 (2006): 54.
  15. """
  16. import numpy as np
  17. import pandas as pd
  18. import statsmodels.api as sm
  19.  
  20. from scipy.special import gammaln as lgamma
  21.  
  22. from statsmodels.base.model import GenericLikelihoodModel
  23. from statsmodels.genmod.families import Binomial
  24.  
  25. # this is only need while #2024 is open.
  26. class Logit(sm.families.links.Logit):
  27.  
  28.     """Logit tranform that won't overflow with large numbers."""
  29.  
  30.     def inverse(self, z):
  31.         return 1 / (1. + np.exp(-z))
  32.  
  33. _init_example = """
  34.  
  35.     Beta regression with default of logit-link for exog and log-link
  36.     for precision.
  37.  
  38.     >>> mod = Beta(endog, exog)
  39.     >>> rslt = mod.fit()
  40.     >>> print rslt.summary()
  41.  
  42.     We can also specify a formula and a specific structure and use the
  43.     identity-link for phi.
  44.  
  45.     >>> from sm.families.links import identity
  46.     >>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
  47.     >>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
  48.     ...                         dat, Z=Z, link_phi=identity())
  49.  
  50.     In the case of proportion-data, we may think that the precision depends on
  51.     the number of measurements. E.g for sequence data, on the number of
  52.     sequence reads covering a site:
  53.  
  54.     >>> Z = patsy.dmatrix('~ coverage', df)
  55.     >>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z)
  56.     >>> rslt = mod.fit()
  57.  
  58. """
  59.  
  60. class Beta(GenericLikelihoodModel):
  61.  
  62.     """Beta Regression.
  63.  
  64.     This implementation uses `phi` as a precision parameter equal to
  65.     `a + b` from the Beta parameters.
  66.     """
  67.  
  68.     def __init__(self, endog, exog, Z=None, link=Logit(),
  69.             link_phi=sm.families.links.Log(), **kwds):
  70.         """
  71.         Parameters
  72.         ----------
  73.         endog : array-like
  74.             1d array of endogenous values (i.e. responses, outcomes,
  75.             dependent variables, or 'Y' values).
  76.         exog : array-like
  77.             2d array of exogeneous values (i.e. covariates, predictors,
  78.             independent variables, regressors, or 'X' values). A nobs x k
  79.             array where `nobs` is the number of observations and `k` is
  80.             the number of regressors. An intercept is not included by
  81.             default and should be added by the user. See
  82.             `statsmodels.tools.add_constant`.
  83.         Z : array-like
  84.             2d array of variables for the precision phi.
  85.         link : link
  86.             Any link in sm.families.links for `exog`
  87.         link_phi : link
  88.             Any link in sm.families.links for `Z`
  89.  
  90.         Examples
  91.         --------
  92.         {example}
  93.  
  94.         See Also
  95.         --------
  96.         :ref:`links`
  97.  
  98.         """.format(example=_init_example)
  99.         assert np.all((0 < endog) & (endog < 1))
  100.         if Z is None:
  101.             extra_names = ['phi']
  102.             Z = np.ones((len(endog), 1), dtype='f')
  103.         else:
  104.             extra_names = ['precision-%s' % zc for zc in \
  105.                         (Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))]
  106.         kwds['extra_params_names'] = extra_names
  107.  
  108.         super(Beta, self).__init__(endog, exog, **kwds)
  109.         self.link = link
  110.         self.link_phi = link_phi
  111.        
  112.         self.Z = Z
  113.         assert len(self.Z) == len(self.endog)
  114.  
  115.     def nloglikeobs(self, params):
  116.         """
  117.         Negative log-likelihood.
  118.  
  119.         Parameters
  120.         ----------
  121.  
  122.         params : np.ndarray
  123.             Parameter estimates
  124.         """
  125.         return -self._ll_br(self.endog, self.exog, self.Z, params)
  126.  
  127.     def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False,
  128.             method='bfgs', **kwds):
  129.         """
  130.         Fit the model.
  131.  
  132.         Parameters
  133.         ----------
  134.         start_params : array-like
  135.             A vector of starting values for the regression
  136.             coefficients.  If None, a default is chosen.
  137.         maxiter : integer
  138.             The maximum number of iterations
  139.         disp : bool
  140.             Show convergence stats.
  141.         method : str
  142.             The optimization method to use.
  143.         """
  144.  
  145.         if start_params is None:
  146.             start_params = sm.GLM(self.endog, self.exog, family=Binomial()
  147.                                  ).fit(disp=False).params
  148.             start_params = np.append(start_params, [0.5] * self.Z.shape[1])
  149.  
  150.         return super(Beta, self).fit(start_params=start_params,
  151.                                         maxiter=maxiter, maxfun=maxfun,
  152.                                         method=method, disp=disp, **kwds)
  153.  
  154.     def _ll_br(self, y, X, Z, params):
  155.         nz = self.Z.shape[1]
  156.  
  157.         Xparams = params[:-nz]
  158.         Zparams = params[-nz:]
  159.  
  160.         mu = self.link.inverse(np.dot(X, Xparams))
  161.         phi = self.link_phi.inverse(np.dot(Z, Zparams))
  162.         # TODO: derive a and b and constrain to > 0?
  163.  
  164.         if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf)
  165.  
  166.         ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \
  167.                 + (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \
  168.                 * np.log(1 - y)
  169.  
  170.         return ll
  171.  
  172. if __name__ == "__main__":
  173.  
  174.     import patsy
  175.     dat = pd.read_table('gasoline.txt')
  176.     Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
  177.     # using other precison params with
  178.     m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat,
  179.             Z=Z, link_phi=sm.families.links.identity())
  180.     print m.fit().summary()
  181.  
  182.     fex = pd.read_csv('foodexpenditure.csv')
  183.     m = Beta.from_formula(' I(food/income) ~ income + persons', fex)
  184.     print m.fit().summary()
  185.     #print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary()
  186.  
  187.     dev = pd.read_csv('methylation-test.csv')
  188.     Z = patsy.dmatrix('~ age', dev, return_type='dataframe')
  189.     m = Beta.from_formula('methylation ~ gender + CpG', dev,
  190.             Z=Z,
  191.             link_phi=sm.families.links.identity())
  192.     print m.fit().summary()
RAW Paste Data
Top