Advertisement
Guest User

Untitled

a guest
Jun 12th, 2017
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.11 KB | None | 0 0
import collections
import datetime
import functools
import logging
import re
import smtplib
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.decomposition as deco
from matplotlib.ticker import FuncFormatter
from numpy import exp, log, inf
from pandas import Series
  15.  
  16.  
  17. year2decade = lambda x: int(x.year/10)*10
  18. avg_ret_m = lambda x: exp(log(1+x).mean()*12)-1
  19.  
  20. def chunks(l, n):
  21. """Yield successive n-sized chunks from l."""
  22. for i in xrange(0, len(l), n):
  23. yield l[i:i+n]
  24.  
  25. def calcZScore(sr):
  26. return (sr - sr.mean())/sr.std(ddof=0)
  27.  
  28. def groupQuantile(sr,n,outlier_upper=None,outlier_lower=None):
  29. sr = sr.copy()
  30. if outlier_lower is not None:
  31. sr[sr<sr.quantile(outlier_lower)] = None
  32. if outlier_upper is not None:
  33. sr[sr>sr.quantile(outlier_upper)] = None
  34. step = 1./n
  35. lower = -inf
  36. upper = sr.quantile(step)
  37. grp = Series(-999,index=sr.index)
  38. for i in range(1,n+1):
  39. grp[(sr>lower)&(sr<=upper)] = i
  40. if i<n:
  41. lower = upper
  42. upper = sr.quantile(step*(i+1))
  43. return grp
  44.  
  45. def idstats(mapping, cols):
  46. col1 = cols[0]
  47. col2 = cols[1]
  48. orig = mapping.copy()
  49. mapping = mapping[cols].drop_duplicates().dropna(subset=[col1]).dropna(subset=[col2])
  50. tot = mapping.shape[0]
  51.  
  52. counts = mapping.groupby(col1)[col2].count()
  53. counts[counts>1] = 'n'
  54. counts.name = 'col1'
  55. orig = orig.join(counts, on=col1)
  56.  
  57. counts = mapping.groupby(col2)[col1].count()
  58. counts[counts>1] = 'n'
  59. counts.name = 'col2'
  60. orig = orig.join(counts, on=col2)
  61.  
  62. orig['type'] = orig['col1'].astype(str) + '-' + orig['col2'].astype(str)
  63. del orig['col1']; del orig['col2']
  64.  
  65. print 'col1: %s\tcol2: %s'%tuple(cols)
  66. print orig['type'].value_counts().sort_index()
  67. return orig
  68.  
  69. def NBER_Shade():
  70. """Function adds National Bureau of Economic Research (NBER) recession
  71. bands to a Matplotlib Figure object.
  72. """
  73.  
  74. # load the NBER recession dates
  75. NBER_Dates = pd.read_csv('/Users/cluo/wyss/utils/NBER Dates.txt')
  76.  
  77. # for loop generates recession bands!
  78. for i in range(NBER_Dates.shape[0]):
  79. plt.axvspan(NBER_Dates['Peak'][i], NBER_Dates['Trough'][i],
  80. facecolor='grey', alpha=0.5)
  81.  
  82. def annotateAll(ax, positions, labels):
  83. for pos, label in zip(positions, labels):
  84. ax.annotate(label, pos)
  85. return ax
  86.  
  87. def _to_percent(y, position):
  88. # Ignore the passed in position. This has the effect of scaling the default
  89. # tick locations.
  90. s = str(100 * y)
  91.  
  92. # The percent symbol needs escaping in latex
  93. if matplotlib.rcParams['text.usetex'] == True:
  94. return s + r'$\%$'
  95. else:
  96. return s + '%'
  97.  
  98. to_percent = FuncFormatter(_to_percent)
  99.  
  100. def runPCA(x, n=3):
  101. n_components = n
  102. pca = deco.PCA(n_components).fit(x) # n_components is the components number after reduction
  103. x_r = pca.transform(x)
  104. print ('explained variance (first %d components): %.2f' % (n_components, sum(pca.explained_variance_ratio_)))
  105. cols = ["PC%d" % (i+1) for i in range(n)]
  106. comps = pd.DataFrame(x_r, index=x.index, columns=cols)
  107. wgt = pd.DataFrame(pca.components_, index=cols, columns=x.columns)
  108. return comps, wgt, pca
  109.  
  110. def trying(func):
  111. def new_func(*args, **kwargs):
  112. try:
  113. return func(*args, **kwargs)
  114. except:
  115. return args[0]
  116. return new_func
  117.  
  118. def extractFloat(numstr):
  119. try:
  120. return float(re.findall("\d+.\d+", numstr)[0])
  121. except:
  122. return None
  123.  
  124.  
  125. def toList(x):
  126. if not isinstance(x, list):
  127. return [x]
  128. else:
  129. return x
  130.  
  131. def writeTo(string, path, append=False, retry=3):
  132. success = _writeTo(string, path, append=append)
  133. if (retry > 0) and (not success):
  134. tried = 0
  135. while (tried<retry) and (not success):
  136. tried += 1
  137. time.sleep(1)
  138. success = _writeTo(string, path, append=append)
  139. if not success:
  140. raise
  141.  
  142. def _writeTo(string, path, append=False):
  143. if append:
  144. flag = 'a'
  145. else:
  146. flag = 'w'
  147. try:
  148. fout = open(path, flag)
  149. fout.writelines(string)
  150. except:
  151. logging.exception("is it disk I/O error again?")
  152. return False
  153. else:
  154. fout.close()
  155. return True
  156.  
  157. class memoized(object):
  158. '''Decorator. Caches a function's return value each time it is called.
  159. If called later with the same arguments, the cached value is returned
  160. (not reevaluated).
  161. '''
  162. def __init__(self, func):
  163. self.func = func
  164. self.cache = {}
  165.  
  166. def __call__(self, *args):
  167. if not isinstance(args, collections.Hashable):
  168. # uncacheable. a list, for instance.
  169. # better to not cache than blow up.
  170. return self.func(*args)
  171. if args in self.cache:
  172. return self.cache[args]
  173. else:
  174. value = self.func(*args)
  175. self.cache[args] = value
  176. return value
  177.  
  178. def __repr__(self):
  179. '''Return the function's docstring.'''
  180. return self.func.__doc__
  181.  
  182. def __get__(self, obj, objtype):
  183. '''Support instance methods.'''
  184. return functools.partial(self.__call__, obj)
  185.  
  186.  
  187. def email(toaddrs, msg):
  188. fromaddr = 'ultraman.m45@gmail.com'
  189. username = 'ultraman.m45@gmail.com'
  190. password = 'beatGodzilla'
  191. server = smtplib.SMTP('smtp.gmail.com:587')
  192. server.starttls()
  193. server.login(username, password)
  194. server.sendmail(fromaddr, toaddrs, msg)
  195. server.quit()
  196.  
  197.  
  198. def timerange(data):
  199. if isinstance(data, pd.DataFrame):
  200. return _timerange(data)
  201. elif isinstance(data, pd.Panel):
  202. rez = pd.Panel({item: _timerange(data[item]) for item in data})
  203. rez = rez.to_frame().unstack()
  204. rez.index.name = None
  205. rez.columns.names = [None, None]
  206. return rez
  207. else:
  208. raise Exception()
  209.  
  210.  
  211. def _timerange(df):
  212. rez = {}
  213. for col in df:
  214. sr = df[col].dropna()
  215. rez[col] = {
  216. 'start': sr.index[0].date(),
  217. 'end': sr.index[-1].date(),
  218. 'count': len(sr.index),
  219. }
  220. rez = pd.DataFrame(rez).T[['start', 'end', 'count']]
  221. return rez
  222.  
  223.  
  224. def fastDateMDY(x):
  225. if isinstance(x, str):
  226. x = [int(x) for x in x.split('/')]
  227. return pd.datetime(x[2], x[0], x[1])
  228. else:
  229. return x
  230.  
  231. def fastDateYMD(x):
  232. if isinstance(x, str):
  233. x = [int(x) for x in x.split('/')]
  234. return pd.datetime(x[0], x[1], x[2])
  235. else:
  236. return x
  237.  
  238. def convDate(x):
  239. if not isinstance(x, pd.datetime):
  240. x = pd.datetools.parse(x)
  241. return x
  242.  
  243.  
  244. def nday(dt, n, day):
  245. if not isinstance(dt, pd.datetime):
  246. dt = pd.datetools.parse(dt)
  247. dt = pd.datetime(dt.year, dt.month, 1)
  248. to_day = day - dt.weekday()
  249. to_day = to_day+7 if to_day < 0 else to_day
  250. dt = dt + pd.datetools.Day(to_day) + pd.datetools.Week(n-1)
  251. return dt
  252.  
  253.  
  254. # remove annoying characters
  255. chars = {
  256. u'\xc2\x82' : ',', # High code comma
  257. u'\xc2\x84' : ',,', # High code double comma
  258. u'\xc2\x85' : '...', # Tripple dot
  259. u'\xc2\x88' : '^', # High carat
  260. u'\xc2\x91' : '\x27', # Forward single quote
  261. u'\xc2\x92' : '\x27', # Reverse single quote
  262. u'\xc2\x93' : '\x22', # Forward double quote
  263. u'\xc2\x94' : '\x22', # Reverse double quote
  264. u'\xc2\x95' : ' ',
  265. u'\xc2\x96' : '-', # High hyphen
  266. u'\xc2\x97' : '--', # Double hyphen
  267. u'\xc2\x99' : ' ',
  268. u'\xc2\xa0' : ' ',
  269. u'\xc2\xa6' : '|', # Split vertical bar
  270. u'\xc2\xab' : '<<', # Double less than
  271. u'\xc2\xbb' : '>>', # Double greater than
  272. u'\xc2\xbc' : '1/4', # one quarter
  273. u'\xc2\xbd' : '1/2', # one half
  274. u'\xc2\xbe' : '3/4', # three quarters
  275. u'\xca\xbf' : '\x27', # c-single quote
  276. u'\xcc\xa8' : '', # modifier - under curve
  277. u'\xcc\xb1' : '' # modifier - under line
  278. }
  279. def replace_chars(match):
  280. char = match.group(0)
  281. return chars[char]
  282.  
  283. def beautify(text):
  284. if isinstance(text, (unicode, str)):
  285. return re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text)
  286. else:
  287. print "Beautify can't process %s" % text
  288. return text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement