Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- u"""
- Beta regression for modeling rates and proportions.
- References
- ----------
- Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
- in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
- Economics and Statistics, 2011.
- Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
- Maximum-likelihood regression with beta-distributed dependent variables."
- Psychological methods 11.1 (2006): 54.
- """
- import numpy as np
- import pandas as pd
- import statsmodels.api as sm
- from scipy.special import gammaln as lgamma
- from statsmodels.base.model import GenericLikelihoodModel
- from statsmodels.genmod.families import Binomial
- # this is only need while #2024 is open.
- class Logit(sm.families.links.Logit):
- """Logit tranform that won't overflow with large numbers."""
- def inverse(self, z):
- return 1 / (1. + np.exp(-z))
- _init_example = """
- Beta regression with default of logit-link for exog and log-link
- for precision.
- >>> mod = Beta(endog, exog)
- >>> rslt = mod.fit()
- >>> print rslt.summary()
- We can also specify a formula and a specific structure and use the
- identity-link for phi.
- >>> from sm.families.links import identity
- >>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
- >>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
- ... dat, Z=Z, link_phi=identity())
- In the case of proportion-data, we may think that the precision depends on
- the number of measurements. E.g for sequence data, on the number of
- sequence reads covering a site:
- >>> Z = patsy.dmatrix('~ coverage', df)
- >>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z)
- >>> rslt = mod.fit()
- """
- class Beta(GenericLikelihoodModel):
- """Beta Regression.
- This implementation uses `phi` as a precision parameter equal to
- `a + b` from the Beta parameters.
- """
- def __init__(self, endog, exog, Z=None, link=Logit(),
- link_phi=sm.families.links.Log(), **kwds):
- """
- Parameters
- ----------
- endog : array-like
- 1d array of endogenous values (i.e. responses, outcomes,
- dependent variables, or 'Y' values).
- exog : array-like
- 2d array of exogeneous values (i.e. covariates, predictors,
- independent variables, regressors, or 'X' values). A nobs x k
- array where `nobs` is the number of observations and `k` is
- the number of regressors. An intercept is not included by
- default and should be added by the user. See
- `statsmodels.tools.add_constant`.
- Z : array-like
- 2d array of variables for the precision phi.
- link : link
- Any link in sm.families.links for `exog`
- link_phi : link
- Any link in sm.families.links for `Z`
- Examples
- --------
- {example}
- See Also
- --------
- :ref:`links`
- """.format(example=_init_example)
- assert np.all((0 < endog) & (endog < 1))
- if Z is None:
- extra_names = ['phi']
- Z = np.ones((len(endog), 1), dtype='f')
- else:
- extra_names = ['precision-%s' % zc for zc in \
- (Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))]
- kwds['extra_params_names'] = extra_names
- super(Beta, self).__init__(endog, exog, **kwds)
- self.link = link
- self.link_phi = link_phi
- self.Z = Z
- assert len(self.Z) == len(self.endog)
- def nloglikeobs(self, params):
- """
- Negative log-likelihood.
- Parameters
- ----------
- params : np.ndarray
- Parameter estimates
- """
- return -self._ll_br(self.endog, self.exog, self.Z, params)
- def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False,
- method='bfgs', **kwds):
- """
- Fit the model.
- Parameters
- ----------
- start_params : array-like
- A vector of starting values for the regression
- coefficients. If None, a default is chosen.
- maxiter : integer
- The maximum number of iterations
- disp : bool
- Show convergence stats.
- method : str
- The optimization method to use.
- """
- if start_params is None:
- start_params = sm.GLM(self.endog, self.exog, family=Binomial()
- ).fit(disp=False).params
- start_params = np.append(start_params, [0.5] * self.Z.shape[1])
- return super(Beta, self).fit(start_params=start_params,
- maxiter=maxiter, maxfun=maxfun,
- method=method, disp=disp, **kwds)
- def _ll_br(self, y, X, Z, params):
- nz = self.Z.shape[1]
- Xparams = params[:-nz]
- Zparams = params[-nz:]
- mu = self.link.inverse(np.dot(X, Xparams))
- phi = self.link_phi.inverse(np.dot(Z, Zparams))
- # TODO: derive a and b and constrain to > 0?
- if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf)
- ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \
- + (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \
- * np.log(1 - y)
- return ll
- if __name__ == "__main__":
- import patsy
- dat = pd.read_table('gasoline.txt')
- Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
- # using other precison params with
- m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat,
- Z=Z, link_phi=sm.families.links.identity())
- print m.fit().summary()
- fex = pd.read_csv('foodexpenditure.csv')
- m = Beta.from_formula(' I(food/income) ~ income + persons', fex)
- print m.fit().summary()
- #print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary()
- dev = pd.read_csv('methylation-test.csv')
- Z = patsy.dmatrix('~ age', dev, return_type='dataframe')
- m = Beta.from_formula('methylation ~ gender + CpG', dev,
- Z=Z,
- link_phi=sm.families.links.identity())
- print m.fit().summary()
Add Comment
Please, Sign In to add comment