Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import collections
import datetime
import functools
import logging
import re
import smtplib
import time

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
from numpy import exp, log, inf
import pandas as pd
from pandas import Series
import sklearn.decomposition as deco
def year2decade(x):
    """Return the decade of `x`'s year, e.g. a 1987 date -> 1980.

    `x` is anything with a `.year` attribute (datetime, date, Timestamp).
    (Was a lambda assigned to a name; PEP 8 prefers `def`.)
    """
    return int(x.year / 10) * 10
def avg_ret_m(x):
    """Annualize a series of monthly simple returns geometrically.

    Takes the mean of log(1 + r), scales by 12 months, and converts back to
    a simple return. (Was a lambda assigned to a name; PEP 8 prefers `def`.)
    """
    return exp(log(1 + x).mean() * 12) - 1
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Python 3 fix: `xrange` no longer exists; `range` is already lazy.
    The final chunk may be shorter than n.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]
def calcZScore(sr):
    """Standardize a series: subtract its mean and divide by the population
    standard deviation (ddof=0)."""
    centered = sr - sr.mean()
    return centered / sr.std(ddof=0)
def groupQuantile(sr, n, outlier_upper=None, outlier_lower=None):
    """Bucket the values of `sr` into `n` quantile groups numbered 1..n.

    Values beyond the optional outlier quantile bounds are blanked out
    before the bucket edges are computed; they (and NaNs) remain in the
    sentinel bucket -999.
    """
    sr = sr.copy()
    # Mask extreme observations first so they don't distort the edges.
    if outlier_lower is not None:
        sr[sr < sr.quantile(outlier_lower)] = None
    if outlier_upper is not None:
        sr[sr > sr.quantile(outlier_upper)] = None
    step = 1. / n
    lo, hi = -inf, sr.quantile(step)
    grp = Series(-999, index=sr.index)
    for bucket in range(1, n + 1):
        grp[(sr > lo) & (sr <= hi)] = bucket
        if bucket < n:
            lo = hi
            hi = sr.quantile(step * (bucket + 1))
    return grp
def idstats(mapping, cols):
    """Classify the cardinality of the mapping between two id columns.

    Parameters
    ----------
    mapping : DataFrame containing at least the two columns named in `cols`.
    cols : sequence of two column names, [col1, col2].

    Returns the original frame with an added 'type' column ('1-1', '1-n',
    'n-1' or 'n-n') and prints the distribution of mapping types.

    Fixes: Python 2 print statements; removed the unused `tot` local; the
    count Series is cast to object before 'n' is written into it (assigning
    a str into an int Series is deprecated in modern pandas).
    """
    col1, col2 = cols[0], cols[1]
    orig = mapping.copy()
    # Only de-duplicated, fully populated pairs count toward cardinality.
    mapping = mapping[cols].drop_duplicates().dropna(subset=[col1]).dropna(subset=[col2])
    counts = mapping.groupby(col1)[col2].count().astype(object)
    counts[counts > 1] = 'n'   # collapse any multiplicity > 1 to 'n'
    counts.name = 'col1'
    orig = orig.join(counts, on=col1)
    counts = mapping.groupby(col2)[col1].count().astype(object)
    counts[counts > 1] = 'n'
    counts.name = 'col2'
    orig = orig.join(counts, on=col2)
    orig['type'] = orig['col1'].astype(str) + '-' + orig['col2'].astype(str)
    del orig['col1']; del orig['col2']
    print('col1: %s\tcol2: %s' % tuple(cols))
    print(orig['type'].value_counts().sort_index())
    return orig
def NBER_Shade(dates_path='/Users/cluo/wyss/utils/NBER Dates.txt'):
    """Add National Bureau of Economic Research (NBER) recession bands to
    the current Matplotlib axes.

    Parameters
    ----------
    dates_path : str, optional
        CSV file with 'Peak' and 'Trough' columns. Defaults to the
        previously hard-coded location, so existing callers are unaffected;
        new callers can point elsewhere.
    """
    # load the NBER recession dates
    NBER_Dates = pd.read_csv(dates_path)
    # one translucent grey band per recession (Peak -> Trough)
    for i in range(NBER_Dates.shape[0]):
        plt.axvspan(NBER_Dates['Peak'][i], NBER_Dates['Trough'][i],
                    facecolor='grey', alpha=0.5)
- def annotateAll(ax, positions, labels):
- for pos, label in zip(positions, labels):
- ax.annotate(label, pos)
- return ax
def _to_percent(y, position):
    """Format a tick value `y` (a fraction) as a percent string.

    `position` is ignored; it is only accepted to satisfy the FuncFormatter
    callback signature, which has the effect of scaling the default tick
    locations.
    """
    s = str(100 * y)
    # The percent symbol needs escaping when LaTeX rendering is enabled.
    if matplotlib.rcParams['text.usetex']:
        return s + r'$\%$'
    return s + '%'
# Ready-made formatter wrapping _to_percent so it can be passed straight to
# an axis via set_major_formatter(to_percent).
to_percent = FuncFormatter(_to_percent)
def runPCA(x, n=3):
    """Fit an n-component PCA on the DataFrame `x`.

    Prints the total explained variance ratio and returns a tuple of
    (component scores as a DataFrame indexed like `x`, component loadings
    as a DataFrame over `x`'s columns, the fitted PCA object).
    """
    pca = deco.PCA(n).fit(x)  # n is the number of components kept after reduction
    scores = pca.transform(x)
    print('explained variance (first %d components): %.2f' % (n, sum(pca.explained_variance_ratio_)))
    labels = ["PC%d" % (i + 1) for i in range(n)]
    comps = pd.DataFrame(scores, index=x.index, columns=labels)
    wgt = pd.DataFrame(pca.components_, index=labels, columns=x.columns)
    return comps, wgt, pca
def trying(func):
    """Decorator: call `func`; on failure return the first positional
    argument unchanged (best-effort transformation pattern).

    Fixes: bare `except:` narrowed to `except Exception` so SystemExit and
    KeyboardInterrupt still propagate; functools.wraps preserves the wrapped
    function's name/docstring.
    """
    @functools.wraps(func)
    def new_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Fall back to the untransformed input. Assumes at least one
            # positional argument, as the original did.
            return args[0]
    return new_func
def extractFloat(numstr):
    """Return the first decimal number (digits '.' digits) found in
    `numstr`, or None when there is no match or the input is not a string.

    Fixes: the original pattern "\\d+.\\d+" was not a raw string and left the
    dot unescaped, so '.' matched ANY character (e.g. '12x34' matched and
    only failed later inside float()); bare `except:` narrowed to the two
    exceptions that can actually occur (no match / non-string input).
    """
    try:
        return float(re.findall(r"\d+\.\d+", numstr)[0])
    except (IndexError, TypeError):
        return None
def toList(x):
    """Wrap `x` in a single-element list unless it already is a list."""
    return x if isinstance(x, list) else [x]
def writeTo(string, path, append=False, retry=3):
    """Write `string` to `path` via _writeTo, retrying up to `retry` times
    (one second apart) when the write fails.

    Raises
    ------
    IOError (OSError) when every attempt fails.

    Fix: the original ended with a bare `raise` outside any except block,
    which raises RuntimeError('No active exception to re-raise') instead of
    reporting the real problem; it also never raised at all when retry <= 0.
    """
    success = _writeTo(string, path, append=append)
    tried = 0
    while (tried < retry) and (not success):
        tried += 1
        time.sleep(1)
        success = _writeTo(string, path, append=append)
    if not success:
        raise IOError("writeTo failed after %d retries: %s" % (max(retry, 0), path))
- def _writeTo(string, path, append=False):
- if append:
- flag = 'a'
- else:
- flag = 'w'
- try:
- fout = open(path, flag)
- fout.writelines(string)
- except:
- logging.exception("is it disk I/O error again?")
- return False
- else:
- fout.close()
- return True
class memoized(object):
    '''Decorator. Caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned
    (not reevaluated). Calls with unhashable arguments (e.g. lists) are
    passed through uncached.
    '''
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        # BUG FIX: the original tested isinstance(args, collections.Hashable).
        # `args` is a tuple, and a tuple is *always* an instance of Hashable
        # even when its elements are not, so a list argument crashed with
        # TypeError at the dict lookup. (collections.Hashable was also
        # removed in Python 3.10.) EAFP: try the lookup and fall back on
        # TypeError for unhashable arguments.
        try:
            return self.cache[args]
        except KeyError:
            value = self.func(*args)
            self.cache[args] = value
            return value
        except TypeError:
            # Unhashable arguments: better to not cache than blow up.
            return self.func(*args)

    def __repr__(self):
        '''Return the function's docstring.'''
        return self.func.__doc__

    def __get__(self, obj, objtype):
        '''Support instance methods.'''
        return functools.partial(self.__call__, obj)
def email(toaddrs, msg,
          fromaddr='ultraman.m45@gmail.com',
          username='ultraman.m45@gmail.com',
          password='beatGodzilla'):
    """Send `msg` to `toaddrs` through Gmail SMTP over STARTTLS.

    SECURITY: the default credentials were hard-coded in (publicly pasted)
    source — they must be rotated. Prefer passing username/password
    explicitly or loading them from the environment; the defaults are kept
    only so existing callers keep working.

    The connection is now always closed via try/finally, even when login or
    sendmail raises.
    """
    server = smtplib.SMTP('smtp.gmail.com:587')
    try:
        server.starttls()
        server.login(username, password)
        server.sendmail(fromaddr, toaddrs, msg)
    finally:
        server.quit()
def timerange(data):
    """Summarize per-column data coverage.

    DataFrame -> _timerange(data): start date, end date and non-null count
    per column. Panel (legacy; removed in pandas 0.25) -> the per-item
    _timerange results unstacked into one frame.

    Raises
    ------
    TypeError for any other input (the original raised a bare Exception()
    with no message).
    """
    if isinstance(data, pd.DataFrame):
        return _timerange(data)
    # Guard the attribute lookup: pd.Panel no longer exists in modern
    # pandas, and without the guard any non-DataFrame input would die with
    # AttributeError instead of a clear TypeError.
    if hasattr(pd, 'Panel') and isinstance(data, pd.Panel):
        rez = pd.Panel({item: _timerange(data[item]) for item in data})
        rez = rez.to_frame().unstack()
        rez.index.name = None
        rez.columns.names = [None, None]
        return rez
    raise TypeError('timerange expects a DataFrame or Panel, got %r' % type(data))
- def _timerange(df):
- rez = {}
- for col in df:
- sr = df[col].dropna()
- rez[col] = {
- 'start': sr.index[0].date(),
- 'end': sr.index[-1].date(),
- 'count': len(sr.index),
- }
- rez = pd.DataFrame(rez).T[['start', 'end', 'count']]
- return rez
def fastDateMDY(x):
    """Parse a 'month/day/year' string into a datetime; pass anything else
    through unchanged.

    Fix: the `pd.datetime` alias was removed in pandas 2.0 — use
    datetime.datetime directly (added to the module imports).
    """
    if isinstance(x, str):
        m, d, y = (int(part) for part in x.split('/'))
        return datetime.datetime(y, m, d)
    else:
        return x
def fastDateYMD(x):
    """Parse a 'year/month/day' string into a datetime; pass anything else
    through unchanged.

    Fix: the `pd.datetime` alias was removed in pandas 2.0 — use
    datetime.datetime directly (added to the module imports).
    """
    if isinstance(x, str):
        y, m, d = (int(part) for part in x.split('/'))
        return datetime.datetime(y, m, d)
    else:
        return x
def convDate(x):
    """Coerce `x` to a datetime: datetime-likes pass through unchanged,
    anything else is parsed with pd.to_datetime.

    Fixes: `pd.datetime` and `pd.datetools.parse` were removed from pandas;
    datetime.datetime (module import) + pd.to_datetime are the supported
    equivalents. pd.Timestamp subclasses datetime, so the isinstance check
    keeps passing parsed values through on later calls.
    """
    if not isinstance(x, datetime.datetime):
        x = pd.to_datetime(x)
    return x
def nday(dt, n, day):
    """Return the n-th occurrence of weekday `day` (0=Monday .. 6=Sunday)
    in the month of `dt`.

    `dt` may be a datetime or a date string parseable by pd.to_datetime.

    Fixes: pd.datetools (Day/Week offsets, parse) was removed from pandas;
    the arithmetic is now plain datetime.timedelta, which reproduces
    "first `day` on/after the 1st, plus n-1 weeks".
    """
    if not isinstance(dt, datetime.datetime):
        dt = pd.to_datetime(dt)
    # Anchor at the first of the month, then step forward to weekday `day`.
    dt = datetime.datetime(dt.year, dt.month, 1)
    to_day = day - dt.weekday()
    if to_day < 0:
        to_day += 7
    return dt + datetime.timedelta(days=to_day, weeks=n - 1)
# Remove annoying characters.
# Translation table used by beautify() below. NOTE(review): the keys look
# like UTF-8 byte pairs read back as individual code points (classic
# Windows-1252-as-UTF-8 mojibake) — confirm against the data this cleans.
chars = {
    u'\xc2\x82' : ',',        # High code comma
    u'\xc2\x84' : ',,',       # High code double comma
    u'\xc2\x85' : '...',      # Tripple dot
    u'\xc2\x88' : '^',        # High carat
    u'\xc2\x91' : '\x27',     # Forward single quote
    u'\xc2\x92' : '\x27',     # Reverse single quote
    u'\xc2\x93' : '\x22',     # Forward double quote
    u'\xc2\x94' : '\x22',     # Reverse double quote
    u'\xc2\x95' : ' ',
    u'\xc2\x96' : '-',        # High hyphen
    u'\xc2\x97' : '--',       # Double hyphen
    u'\xc2\x99' : ' ',
    u'\xc2\xa0' : ' ',        # Non-breaking space (as mojibake)
    u'\xc2\xa6' : '|',        # Split vertical bar
    u'\xc2\xab' : '<<',       # Double less than
    u'\xc2\xbb' : '>>',       # Double greater than
    u'\xc2\xbc' : '1/4',      # one quarter
    u'\xc2\xbd' : '1/2',      # one half
    u'\xc2\xbe' : '3/4',      # three quarters
    u'\xca\xbf' : '\x27',     # c-single quote
    u'\xcc\xa8' : '',         # modifier - under curve
    u'\xcc\xb1' : ''          # modifier - under line
}
def replace_chars(match):
    """re.sub callback: map the matched sequence through the `chars` table."""
    return chars[match.group(0)]
def beautify(text):
    """Replace every sequence listed in the `chars` table within `text`.

    Non-string input is returned unchanged after printing a notice.

    Fixes: `unicode` does not exist in Python 3 (str covers it); the Python
    2 print statement is now a function call. The alternation pattern is
    built from the table keys, none of which contain regex metacharacters.
    """
    if isinstance(text, str):
        return re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text)
    else:
        print("Beautify can't process %s" % text)
        return text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement