Untitled

import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
#matplotlib.style.use('ggplot')


xaxis = [7.82, 7.49, 4.97, 5.14, 6.64, 7.6, 8.1, 5.78, 6.42, 6.19, 5.72, 6.71, 6.1, 7.7, 6.77, 7.89, 7.13, 5.92, 5.23, 5.97, 6.05, 7.94, 6.37, 6.44, 7.09, 7.24, 5.82, 6.5, 8.12, 6.82, 6.51, 7.25, 6.49, 6.24, 6.95, 6.3, 7.45, 5.71, 7.38, 7.61, 7.62, 7.01, 5.94, 7.08, 7.1, 6.62, 7.11, 5.89, 6.56, 7.63, 7.8, 5.93, 7.28, 6.08, 7.53, 5.88, 7.98, 7.29, 6.15, 7.23, 7.81, 8.13, 5.99, 6.96, 6.98, 7.71, 6.17, 6.01, 7.91, 7.95, 6.76, 7.85, 6.36, 7.41, 5.96, 6.33, 6.21, 6.61, 7.14, 7.64, 7.34, 6.53, 6.91, 6.97, 7.21, 7.59, 7.96, 6.86, 7.69, 6.0, 6.68, 7.77, 6.92, 7.55, 7.02, 6.83, 7.07, 7.87, 6.27, 7.51, 5.84, 7.79, 6.07, 7.0, 6.52, 5.86, 5.79, 7.05, 5.95, 6.72, 6.69, 6.88, 7.37, 6.25, 7.31, 7.9, 8.0, 6.32, 6.79, 5.69, 7.35, 6.87, 6.22, 6.18, 6.23, 6.74, 7.83, 7.48, 6.43, 6.57, 7.17, 6.78, 7.36, 7.67, 5.9, 6.67, 7.5, 8.04, 7.26, 7.65, 7.2, 6.47, 7.32, 6.46, 6.04, 7.03, 6.65, 7.22, 6.16, 5.75, 6.45, 7.3, 6.39, 6.93, 7.54, 7.99, 5.91, 6.9, 7.4, 5.85, 6.13, 6.59, 5.8, 7.33, 6.6, 7.27, 7.56, 6.26, 6.58, 7.47, 6.12, 6.34, 7.76, 6.4, 7.66, 6.8, 7.57, 6.7, 7.19, 6.14, 7.88, 6.48, 5.81, 6.02, 6.35, 6.28, 7.16, 6.03, 6.29, 7.75, 7.44, 7.58, 5.98, 6.2, 7.86, 6.75, 7.74, 7.06, 7.93, 7.18, 6.89, 6.63, 6.73, 7.78, 6.84, 7.68, 5.87, 5.83, 6.11, 7.52, 6.55, 7.04, 6.54, 6.31, 7.43, 8.14, 7.72, 7.46, 7.73, 6.99, 5.2, 6.38, 6.41, 7.92, 6.94, 6.66, 7.15, 7.39, 6.85, 7.84, 7.12, 8.05, 8.02, 6.09, 6.81, 7.42]

yaxis = [1, 120, 3, 1, 16, 16, 2, 2, 8, 9, 3, 19, 11, 9, 52, 6, 312, 39, 1, 6, 10, 2, 3, 9, 240, 338, 9, 64, 1, 96, 16, 314, 61, 7, 219, 3, 48, 3, 41, 22, 17, 213, 11, 226, 224, 14, 251, 35, 61, 14, 2, 18, 221, 8, 45, 8, 4, 155, 13, 418, 3, 1, 2, 221, 225, 14, 7, 6, 6, 2, 60, 6, 3, 44, 20, 3, 12, 37, 263, 15, 73, 54, 115, 233, 367, 17, 5, 118, 18, 5, 28, 7, 133, 36, 332, 92, 215, 8, 9, 108, 3, 9, 4, 239, 38, 6, 5, 273, 26, 49, 28, 181, 93, 5, 167, 8, 1, 6, 43, 1, 108, 173, 5, 10, 3, 26, 3, 109, 10, 63, 417, 63, 67, 8, 23, 10, 170, 1, 346, 14, 389, 33, 125, 30, 4, 248, 25, 352, 7, 1, 25, 127, 5, 206, 81, 1, 11, 169, 29, 7, 6, 18, 8, 84, 19, 209, 24, 2, 34, 103, 13, 6, 4, 3, 9, 73, 13, 26, 391, 18, 10, 4, 8, 4, 6, 4, 335, 1, 6, 8, 55, 17, 9, 9, 5, 46, 15, 213, 12, 374, 128, 13, 26, 8, 102, 6, 4, 8, 19, 106, 34, 272, 35, 8, 16, 3, 18, 107, 7, 266, 1, 8, 11, 3, 176, 13, 338, 30, 99, 8, 350, 1, 2, 16, 75, 35]

newx = []
newy = []
bothax = []

for (tx, ty) in sorted(zip(xaxis, yaxis)):
    newx.append(tx)
    newy.append(ty)
    bothax.append((tx, ty))

# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, normed=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Distributions to check
    DISTRIBUTIONS = [
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]

    ### st.exponnorm, st.gennorm, st.halfgennorm, st.vonmises_line,st.johnsonsb,

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                if best_sse > sse > 0:
                    print("Updating best fit to %s." %distribution)
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            pass

    return (best_distribution.name, best_params)

def make_pdf(dist, params, size=10000):
    """Generate distributions's Propbability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf

# Load data from statsmodels datasets
#data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())


# Plot for comparison
#plt.figure(figsize=(12,8))
#ax = data.plot(kind='hist', bins=50, normed=True, alpha=0.5, color=plt.rcParams['axes.color_cycle'][1])
ax = plt.plot(newx, newy, 'ro')


# Save plot limits
#dataYLim = ax.get_ylim()

# Find best fit distribution
best_fit_name, best_fir_paramms = best_fit_distribution(bothax) #, 100, ax)

print("best_fit_name =")
print(best_fit_name)
print("best_fir_paramms =")
print(best_fir_paramms)
best_dist = getattr(st, best_fit_name)

# Update plots
#ax.set_ylim(dataYLim)
plt.title(u'All Fitted Distributions')
plt.xlabel(u'Distance (feet)')
plt.ylabel('Frequency')

# Make PDF
pdf = make_pdf(best_dist, best_fir_paramms)

# Display
plt.figure(figsize=(12,8))
ax = pdf.plot(lw=2, label='PDF', legend=True)
#plt.plot(bothax, 'ro')
#data.plot(kind='hist', bins=50, normed=True, alpha=0.5, label='Data', legend=True, ax=ax)
plt.plot(bothax, alpha=0.5, label='Data')


param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fir_paramms)])
dist_str = '{}({})'.format(best_fit_name, param_str)

ax.set_title(u'Just the best fit..n' + dist_str)
ax.set_xlabel(u'Distance')
ax.set_ylabel('Frequency')

plt.show()