Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import collections
import datetime
import functools
import logging
import re
import smtplib
import time

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
from numpy import exp, log, inf
import pandas as pd
from pandas import Series
import sklearn.decomposition as deco
def year2decade(x):
    """Return the decade of `x`'s year, e.g. a 1987 date -> 1980.

    `x` is anything with a `.year` attribute (datetime, date, Timestamp).
    (Was a lambda assigned to a name; PEP 8 prefers `def`.)
    """
    return int(x.year / 10) * 10
def avg_ret_m(x):
    """Annualize a series of monthly simple returns geometrically.

    Takes the mean of log(1 + r), scales by 12 months, and converts back to
    a simple return. (Was a lambda assigned to a name; PEP 8 prefers `def`.)
    """
    return exp(log(1 + x).mean() * 12) - 1
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Python 3 fix: `xrange` no longer exists; `range` is already lazy.
    The final chunk may be shorter than n.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]
def calcZScore(sr):
    """Standardize a series: subtract its mean and divide by the population
    standard deviation (ddof=0)."""
    centered = sr - sr.mean()
    return centered / sr.std(ddof=0)
def groupQuantile(sr, n, outlier_upper=None, outlier_lower=None):
    """Bucket the values of `sr` into `n` quantile groups numbered 1..n.

    Values beyond the optional outlier quantile bounds are blanked out
    before the bucket edges are computed; they (and NaNs) remain in the
    sentinel bucket -999.
    """
    sr = sr.copy()
    # Mask extreme observations first so they don't distort the edges.
    if outlier_lower is not None:
        sr[sr < sr.quantile(outlier_lower)] = None
    if outlier_upper is not None:
        sr[sr > sr.quantile(outlier_upper)] = None
    step = 1. / n
    lo, hi = -inf, sr.quantile(step)
    grp = Series(-999, index=sr.index)
    for bucket in range(1, n + 1):
        grp[(sr > lo) & (sr <= hi)] = bucket
        if bucket < n:
            lo = hi
            hi = sr.quantile(step * (bucket + 1))
    return grp
def idstats(mapping, cols):
    """Classify the cardinality of the mapping between two id columns.

    Parameters
    ----------
    mapping : DataFrame containing at least the two columns named in `cols`.
    cols : sequence of two column names, [col1, col2].

    Returns the original frame with an added 'type' column ('1-1', '1-n',
    'n-1' or 'n-n') and prints the distribution of mapping types.

    Fixes: Python 2 print statements; removed the unused `tot` local; the
    count Series is cast to object before 'n' is written into it (assigning
    a str into an int Series is deprecated in modern pandas).
    """
    col1, col2 = cols[0], cols[1]
    orig = mapping.copy()
    # Only de-duplicated, fully populated pairs count toward cardinality.
    mapping = mapping[cols].drop_duplicates().dropna(subset=[col1]).dropna(subset=[col2])
    counts = mapping.groupby(col1)[col2].count().astype(object)
    counts[counts > 1] = 'n'   # collapse any multiplicity > 1 to 'n'
    counts.name = 'col1'
    orig = orig.join(counts, on=col1)
    counts = mapping.groupby(col2)[col1].count().astype(object)
    counts[counts > 1] = 'n'
    counts.name = 'col2'
    orig = orig.join(counts, on=col2)
    orig['type'] = orig['col1'].astype(str) + '-' + orig['col2'].astype(str)
    del orig['col1']; del orig['col2']
    print('col1: %s\tcol2: %s' % tuple(cols))
    print(orig['type'].value_counts().sort_index())
    return orig
def NBER_Shade(dates_path='/Users/cluo/wyss/utils/NBER Dates.txt'):
    """Add National Bureau of Economic Research (NBER) recession bands to
    the current Matplotlib axes.

    Parameters
    ----------
    dates_path : str, optional
        CSV file with 'Peak' and 'Trough' columns. Defaults to the
        previously hard-coded location, so existing callers are unaffected;
        new callers can point elsewhere.
    """
    # load the NBER recession dates
    NBER_Dates = pd.read_csv(dates_path)
    # one translucent grey band per recession (Peak -> Trough)
    for i in range(NBER_Dates.shape[0]):
        plt.axvspan(NBER_Dates['Peak'][i], NBER_Dates['Trough'][i],
                    facecolor='grey', alpha=0.5)
- def annotateAll(ax, positions, labels):
- for pos, label in zip(positions, labels):
- ax.annotate(label, pos)
- return ax
def _to_percent(y, position):
    """Format a tick value `y` (a fraction) as a percent string.

    `position` is ignored; it is only accepted to satisfy the FuncFormatter
    callback signature, which has the effect of scaling the default tick
    locations.
    """
    s = str(100 * y)
    # The percent symbol needs escaping when LaTeX rendering is enabled.
    if matplotlib.rcParams['text.usetex']:
        return s + r'$\%$'
    return s + '%'
# Ready-made formatter wrapping _to_percent so it can be passed straight to
# an axis via set_major_formatter(to_percent).
to_percent = FuncFormatter(_to_percent)
def runPCA(x, n=3):
    """Fit an n-component PCA on the DataFrame `x`.

    Prints the total explained variance ratio and returns a tuple of
    (component scores as a DataFrame indexed like `x`, component loadings
    as a DataFrame over `x`'s columns, the fitted PCA object).
    """
    pca = deco.PCA(n).fit(x)  # n is the number of components kept after reduction
    scores = pca.transform(x)
    print('explained variance (first %d components): %.2f' % (n, sum(pca.explained_variance_ratio_)))
    labels = ["PC%d" % (i + 1) for i in range(n)]
    comps = pd.DataFrame(scores, index=x.index, columns=labels)
    wgt = pd.DataFrame(pca.components_, index=labels, columns=x.columns)
    return comps, wgt, pca
def trying(func):
    """Decorator: call `func`; on failure return the first positional
    argument unchanged (best-effort transformation pattern).

    Fixes: bare `except:` narrowed to `except Exception` so SystemExit and
    KeyboardInterrupt still propagate; functools.wraps preserves the wrapped
    function's name/docstring.
    """
    @functools.wraps(func)
    def new_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Fall back to the untransformed input. Assumes at least one
            # positional argument, as the original did.
            return args[0]
    return new_func
def extractFloat(numstr):
    """Return the first decimal number (digits '.' digits) found in
    `numstr`, or None when there is no match or the input is not a string.

    Fixes: the original pattern "\\d+.\\d+" was not a raw string and left the
    dot unescaped, so '.' matched ANY character (e.g. '12x34' matched and
    only failed later inside float()); bare `except:` narrowed to the two
    exceptions that can actually occur (no match / non-string input).
    """
    try:
        return float(re.findall(r"\d+\.\d+", numstr)[0])
    except (IndexError, TypeError):
        return None
def toList(x):
    """Wrap `x` in a single-element list unless it already is a list."""
    return x if isinstance(x, list) else [x]
def writeTo(string, path, append=False, retry=3):
    """Write `string` to `path` via _writeTo, retrying up to `retry` times
    (one second apart) when the write fails.

    Raises
    ------
    IOError (OSError) when every attempt fails.

    Fix: the original ended with a bare `raise` outside any except block,
    which raises RuntimeError('No active exception to re-raise') instead of
    reporting the real problem; it also never raised at all when retry <= 0.
    """
    success = _writeTo(string, path, append=append)
    tried = 0
    while (tried < retry) and (not success):
        tried += 1
        time.sleep(1)
        success = _writeTo(string, path, append=append)
    if not success:
        raise IOError("writeTo failed after %d retries: %s" % (max(retry, 0), path))
- def _writeTo(string, path, append=False):
- if append:
- flag = 'a'
- else:
- flag = 'w'
- try:
- fout = open(path, flag)
- fout.writelines(string)
- except:
- logging.exception("is it disk I/O error again?")
- return False
- else:
- fout.close()
- return True
class memoized(object):
    '''Decorator. Caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned
    (not reevaluated). Calls with unhashable arguments (e.g. lists) are
    passed through uncached.
    '''
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        # BUG FIX: the original tested isinstance(args, collections.Hashable).
        # `args` is a tuple, and a tuple is *always* an instance of Hashable
        # even when its elements are not, so a list argument crashed with
        # TypeError at the dict lookup. (collections.Hashable was also
        # removed in Python 3.10.) EAFP: try the lookup and fall back on
        # TypeError for unhashable arguments.
        try:
            return self.cache[args]
        except KeyError:
            value = self.func(*args)
            self.cache[args] = value
            return value
        except TypeError:
            # Unhashable arguments: better to not cache than blow up.
            return self.func(*args)

    def __repr__(self):
        '''Return the function's docstring.'''
        return self.func.__doc__

    def __get__(self, obj, objtype):
        '''Support instance methods.'''
        return functools.partial(self.__call__, obj)
def email(toaddrs, msg,
          fromaddr='ultraman.m45@gmail.com',
          username='ultraman.m45@gmail.com',
          password='beatGodzilla'):
    """Send `msg` to `toaddrs` through Gmail SMTP over STARTTLS.

    SECURITY: the default credentials were hard-coded in (publicly pasted)
    source — they must be rotated. Prefer passing username/password
    explicitly or loading them from the environment; the defaults are kept
    only so existing callers keep working.

    The connection is now always closed via try/finally, even when login or
    sendmail raises.
    """
    server = smtplib.SMTP('smtp.gmail.com:587')
    try:
        server.starttls()
        server.login(username, password)
        server.sendmail(fromaddr, toaddrs, msg)
    finally:
        server.quit()
def timerange(data):
    """Summarize per-column data coverage.

    DataFrame -> _timerange(data): start date, end date and non-null count
    per column. Panel (legacy; removed in pandas 0.25) -> the per-item
    _timerange results unstacked into one frame.

    Raises
    ------
    TypeError for any other input (the original raised a bare Exception()
    with no message).
    """
    if isinstance(data, pd.DataFrame):
        return _timerange(data)
    # Guard the attribute lookup: pd.Panel no longer exists in modern
    # pandas, and without the guard any non-DataFrame input would die with
    # AttributeError instead of a clear TypeError.
    if hasattr(pd, 'Panel') and isinstance(data, pd.Panel):
        rez = pd.Panel({item: _timerange(data[item]) for item in data})
        rez = rez.to_frame().unstack()
        rez.index.name = None
        rez.columns.names = [None, None]
        return rez
    raise TypeError('timerange expects a DataFrame or Panel, got %r' % type(data))
- def _timerange(df):
- rez = {}
- for col in df:
- sr = df[col].dropna()
- rez[col] = {
- 'start': sr.index[0].date(),
- 'end': sr.index[-1].date(),
- 'count': len(sr.index),
- }
- rez = pd.DataFrame(rez).T[['start', 'end', 'count']]
- return rez
def fastDateMDY(x):
    """Parse a 'month/day/year' string into a datetime; pass anything else
    through unchanged.

    Fix: the `pd.datetime` alias was removed in pandas 2.0 — use
    datetime.datetime directly (added to the module imports).
    """
    if isinstance(x, str):
        m, d, y = (int(part) for part in x.split('/'))
        return datetime.datetime(y, m, d)
    else:
        return x
def fastDateYMD(x):
    """Parse a 'year/month/day' string into a datetime; pass anything else
    through unchanged.

    Fix: the `pd.datetime` alias was removed in pandas 2.0 — use
    datetime.datetime directly (added to the module imports).
    """
    if isinstance(x, str):
        y, m, d = (int(part) for part in x.split('/'))
        return datetime.datetime(y, m, d)
    else:
        return x
def convDate(x):
    """Coerce `x` to a datetime: datetime-likes pass through unchanged,
    anything else is parsed with pd.to_datetime.

    Fixes: `pd.datetime` and `pd.datetools.parse` were removed from pandas;
    datetime.datetime (module import) + pd.to_datetime are the supported
    equivalents. pd.Timestamp subclasses datetime, so the isinstance check
    keeps passing parsed values through on later calls.
    """
    if not isinstance(x, datetime.datetime):
        x = pd.to_datetime(x)
    return x
def nday(dt, n, day):
    """Return the n-th occurrence of weekday `day` (0=Monday .. 6=Sunday)
    in the month of `dt`.

    `dt` may be a datetime or a date string parseable by pd.to_datetime.

    Fixes: pd.datetools (Day/Week offsets, parse) was removed from pandas;
    the arithmetic is now plain datetime.timedelta, which reproduces
    "first `day` on/after the 1st, plus n-1 weeks".
    """
    if not isinstance(dt, datetime.datetime):
        dt = pd.to_datetime(dt)
    # Anchor at the first of the month, then step forward to weekday `day`.
    dt = datetime.datetime(dt.year, dt.month, 1)
    to_day = day - dt.weekday()
    if to_day < 0:
        to_day += 7
    return dt + datetime.timedelta(days=to_day, weeks=n - 1)
# Remove annoying characters.
# Translation table used by beautify() below. NOTE(review): the keys look
# like UTF-8 byte pairs read back as individual code points (classic
# Windows-1252-as-UTF-8 mojibake) — confirm against the data this cleans.
chars = {
    u'\xc2\x82' : ',',        # High code comma
    u'\xc2\x84' : ',,',       # High code double comma
    u'\xc2\x85' : '...',      # Tripple dot
    u'\xc2\x88' : '^',        # High carat
    u'\xc2\x91' : '\x27',     # Forward single quote
    u'\xc2\x92' : '\x27',     # Reverse single quote
    u'\xc2\x93' : '\x22',     # Forward double quote
    u'\xc2\x94' : '\x22',     # Reverse double quote
    u'\xc2\x95' : ' ',
    u'\xc2\x96' : '-',        # High hyphen
    u'\xc2\x97' : '--',       # Double hyphen
    u'\xc2\x99' : ' ',
    u'\xc2\xa0' : ' ',        # Non-breaking space (as mojibake)
    u'\xc2\xa6' : '|',        # Split vertical bar
    u'\xc2\xab' : '<<',       # Double less than
    u'\xc2\xbb' : '>>',       # Double greater than
    u'\xc2\xbc' : '1/4',      # one quarter
    u'\xc2\xbd' : '1/2',      # one half
    u'\xc2\xbe' : '3/4',      # three quarters
    u'\xca\xbf' : '\x27',     # c-single quote
    u'\xcc\xa8' : '',         # modifier - under curve
    u'\xcc\xb1' : ''          # modifier - under line
}
def replace_chars(match):
    """re.sub callback: map the matched sequence through the `chars` table."""
    return chars[match.group(0)]
def beautify(text):
    """Replace every sequence listed in the `chars` table within `text`.

    Non-string input is returned unchanged after printing a notice.

    Fixes: `unicode` does not exist in Python 3 (str covers it); the Python
    2 print statement is now a function call. The alternation pattern is
    built from the table keys, none of which contain regex metacharacters.
    """
    if isinstance(text, str):
        return re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text)
    else:
        print("Beautify can't process %s" % text)
        return text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement