Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- %matplotlib inline
- import os
- import numpy as np
- import matplotlib as mpl
- import matplotlib.pyplot as plt
- import pandas as pd
- import glob
- from joblib import Memory
- from pandas_datareader import wb
- import datetime as dt
# joblib cache object created with '/tmp/'; `read_all` is memoised through it.
- memory = Memory('/tmp/')
# Set the default matplotlib font size to 10 for every figure drawn below.
- mpl.rcParams.update({'font.size': 10})
# Location handed to `conv_all` further down to find the input files.
- DATA_FILE_PATH = './'
def load(file_path):
    """Read one CSV file into a DataFrame on a best-effort basis.

    Args:
        file_path: Path of the file handed to ``pandas.read_csv``.

    Returns:
        The parsed ``DataFrame``, or ``None`` when the file could not be read
        or parsed. A warning naming the file is emitted in that case so a
        skipped input does not go unnoticed.
    """
    import warnings

    try:
        return pd.read_csv(file_path)
    except Exception as exc:
        # Deliberately best effort: one unreadable file must not abort the
        # whole load. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        warnings.warn('skipping {!r}: {}'.format(file_path, exc))
        return None
def get_file_list(path):
    """Return the files matching ``*.txt*`` directly inside the given directories.

    Args:
        path: Either a single directory given as a string, or an iterable of
            directory strings.

    Returns:
        A list with the glob matches of every directory, in the order the
        directories were given.
    """
    # A bare string would otherwise be iterated character by character
    # (e.g. './' -> '.' and '/'), globbing unrelated directories.
    directories = [path] if isinstance(path, str) else path
    found = []
    for directory in directories:
        found.extend(glob.glob(os.path.join(directory, '*.txt*')))
    return found
@memory.cache
def read_all(path):
    """Load every file found by ``get_file_list(path)``.

    Returns:
        A list with one ``load`` result per file, in listing order. Entries
        are ``None`` for files that ``load`` could not read.
    """
    frames = []
    for file_name in get_file_list(path):
        frames.append(load(file_name))
    return frames
def read_date(s):
    """Parse a ``YYYY-MM-DD`` string and return it as a ``datetime.date``."""
    return dt.datetime.strptime(s, '%Y-%m-%d').date()
def to_weekday(s):
    """Return the weekday number (Monday is 0) of a ``YYYY-MM-DD`` string."""
    return read_date(s).weekday()
def find_business_day(s, lis):
    """Return the next weekday after ``s`` if that date is present in ``lis``.

    Args:
        s: Current date as a ``YYYY-MM-DD`` string.
        lis: Container of known date strings; only ``in`` is used on it.

    Returns:
        The following weekday as a ``YYYY-MM-DD`` string, or ``None`` when
        that date is not contained in ``lis`` (public holidays are not
        modelled, so the day after a holiday eve yields ``None``).
    """
    today = dt.datetime.strptime(s, '%Y-%m-%d').date()
    # Friday and Saturday both roll over to Monday; any other day advances
    # by a single calendar day.
    days_ahead = {4: 3, 5: 2}.get(today.weekday(), 1)
    candidate = (today + dt.timedelta(days=days_ahead)).strftime('%Y-%m-%d')
    return candidate if candidate in lis else None
def _is_summer_session(day_rows):
    """Return True when the day's rows contain a 14:35:00 bar (summer-time open)."""
    return bool((day_rows['Time'] == '14:35:00').any())


def _day_prices(df, date, known_dates, time_list):
    """Build the price table for ``date``: open, hourly closes and next open."""
    day_rows = df[df['Date'] == date]
    # Opening price (mind daylight-saving time): the opening bar is stamped
    # 14:35:00 during summer time and 15:35:00 otherwise.
    is_summer = _is_summer_session(day_rows)
    open_time = '14:35:00' if is_summer else '15:35:00'
    # Offset between the hour used in the column name and the hour in the data.
    hour_shift = (14 - 9) if is_summer else (15 - 9)

    frames = [
        day_rows[day_rows['Time'] == open_time][['Date', 'Open']]
        .set_index('Date')
        .rename(columns={'Open': 'p0930'})
    ]
    # Rename the close of each hourly bar to its 'pHH00' column.
    for hour in time_list:
        bar_time = str(hour + hour_shift) + ':00:00'
        frames.append(
            day_rows[day_rows['Time'] == bar_time][['Date', 'Close']]
            .set_index('Date')
            .rename(columns={'Close': 'p' + str(hour) + '00'})
        )

    # 'p3330' holds the open of the following business day, or NaN if unknown.
    next_open = np.nan
    next_date = find_business_day(date, known_dates)
    if next_date is not None:
        next_rows = df[df['Date'] == next_date]
        # Detect summer time on the next day itself so that the days around
        # a clock change still pick up the correct opening bar.
        next_open_time = (
            '14:35:00' if _is_summer_session(next_rows) else '15:35:00'
        )
        next_opens = next_rows[next_rows['Time'] == next_open_time]['Open']
        if len(next_opens) > 0:
            next_open = next_opens.values[0]
    frames.append(pd.DataFrame([next_open], index=[date], columns=['p3330']))
    return pd.concat(frames, axis=1)


def _to_log_returns(prices, time_list):
    """Replace every 'p*' column except 'p0930' by 'c*' and 'r*' log returns."""
    open_price = prices['p0930']
    previous = open_price
    for label in [str(hour) + '00' for hour in time_list] + ['3330']:
        price = prices['p' + label]
        # c*: log return since the open; r*: log return since the previous column.
        prices['c' + label] = (price / open_price).apply(np.log)
        prices['r' + label] = (price / previous).apply(np.log)
        del prices['p' + label]
        previous = price
    # Days with any missing bar are incomplete and are discarded.
    return prices.dropna()


def conv_all(path):
    """Convert the raw intraday files under ``path`` into per-day log returns.

    Each frame returned by ``read_all(path)`` is expected to provide the
    columns ``Date`` (``YYYY-MM-DD`` strings), ``Time`` (``HH:MM:SS``
    strings), ``Open`` and ``Close``. For every date in a file one row is
    built containing:

    * ``p0930``: the ``Open`` of the opening bar,
    * ``cHH00`` / ``rHH00`` for HH in 10..16: log return of the hourly
      ``Close`` relative to the open / to the previous column,
    * ``c3330`` / ``r3330``: the same for the next business day's open.

    Rows with any missing value are dropped, so days lacking a bar or a
    following business day in the same file do not appear in the result.

    Args:
        path: Location forwarded unchanged to ``read_all``.

    Returns:
        A ``DataFrame`` indexed by date string, with fully identical rows
        removed via ``drop_duplicates``.

    Raises:
        ValueError: If no complete day could be extracted from any file.
    """
    time_list = [10, 11, 12, 13, 14, 15, 16]
    daily = []
    for df in read_all(path):
        if df is None:
            continue
        # unique() keeps file order; the set makes the next-day lookup O(1).
        dates = df['Date'].unique()
        known_dates = set(dates)
        for date in dates:
            prices = _to_log_returns(
                _day_prices(df, date, known_dates, time_list), time_list
            )
            if len(prices.index) > 0:
                daily.append(prices)
    if not daily:
        raise ValueError(
            'no complete trading day found for path {!r}'.format(path)
        )
    return pd.concat(daily, axis=0).drop_duplicates()
# Build the per-day table once; every chart below is derived from it.
- df_data = conv_all(DATA_FILE_PATH)
# Percentiles (10% .. 90%) requested from DataFrame.describe and plotted by `draw`.
- PERCENTILES = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
def get_cumulative(df):
    """Return the ``describe`` summary restricted to the cumulative ``c*`` columns."""
    cumulative_columns = [
        'c1000', 'c1100', 'c1200', 'c1300',
        'c1400', 'c1500', 'c1600', 'c3330',
    ]
    summary = df.describe(percentiles=PERCENTILES)
    return summary[cumulative_columns]
def get_relative(df):
    """Return the ``describe`` summary restricted to the step-wise ``r*`` columns."""
    relative_columns = [
        'r1000', 'r1100', 'r1200', 'r1300',
        'r1400', 'r1500', 'r1600', 'r3330',
    ]
    summary = df.describe(percentiles=PERCENTILES)
    return summary[relative_columns]
def explain(df_data, opt=""):
    """Draw the cumulative and relative summaries of ``df_data`` side by side.

    Args:
        df_data: Table accepted by ``get_cumulative`` and ``get_relative``.
        opt: Text placed in front of both chart titles.
    """
    # Both summaries are computed before any figure is created.
    panels = [
        ('cumulative', get_cumulative(df_data), (-0.05, 0.05)),
        ('relative', get_relative(df_data), (-0.02, 0.02)),
    ]
    figure = plt.figure(figsize=(12, 4))
    for position, (title, summary, y_limits) in enumerate(panels, start=1):
        axes = figure.add_subplot(1, 2, position)
        axes.set_ylim(y_limits)
        draw(axes, summary, title, opt)
def draw(ax, df_summary, title, opt):
    """Plot the mean and every percentile row of a summary table on ``ax``.

    Args:
        ax: Axes-like object; ``set_title``, ``grid`` and ``plot`` are called
            on it.
        df_summary: Summary table with a ``'mean'`` row and one ``'<n>%'`` row
            per entry of ``PERCENTILES``. Column labels consist of one letter
            followed by digits (e.g. ``'c1000'``); the digits, read as an
            integer, become the x positions. The table is not modified.
        title: Series name shown in the chart title. Any value other than
            ``'relative'`` gets an extra starting point at x=930, y=0.
        opt: Text placed in front of the chart title.
    """
    # Relabel a copy rather than the caller's frame so the same summary can
    # be drawn more than once.
    summary = df_summary.rename(columns=lambda label: int(label[1:]))

    ax.set_title(opt + title + ' movement')
    ax.grid()

    if title != 'relative':
        # Series measured from the open start at zero at x=930.
        x_prefix, y_prefix = [930], [0]
    else:
        x_prefix, y_prefix = [], []

    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
    mean_row = summary.loc['mean', :]
    x_values = x_prefix + mean_row.index.tolist()
    ax.plot(x_values, y_prefix + mean_row.values.tolist(), linewidth=4)

    for p in PERCENTILES:
        # round() guards against float truncation (e.g. 0.29 * 100 -> 28.99...).
        row_label = '{}%'.format(int(round(p * 100)))
        ax.plot(x_values, y_prefix + summary.loc[row_label, :].values.tolist())
# Chart the summary statistics of the complete data set.
- explain(df_data)
def explain_by_weekday(df_data):
    """Chart the summaries separately for each weekday, Monday to Friday.

    Args:
        df_data: Table indexed by ``YYYY-MM-DD`` date strings. It is copied
            before the helper ``weekday`` column is added.

    For every weekday its number and row count are printed before the charts
    are drawn via ``explain``.
    """
    by_weekday = df_data.copy()
    by_weekday['weekday'] = [to_weekday(stamp) for stamp in by_weekday.index]
    # Full stems so the titles read 'tuesday', 'wednesday' and 'thursday'
    # instead of the misspelt 'tueday', 'wedday' and 'thrday'.
    stems = ['mon', 'tues', 'wednes', 'thurs', 'fri']
    for number, stem in enumerate(stems):
        subset = by_weekday[by_weekday['weekday'] == number]
        print(number, len(subset))
        explain(subset, stem + 'day / ')
# Chart the same statistics split by day of the week.
- explain_by_weekday(df_data)
def to_day(s):
    """Return the day-of-month number of a ``YYYY-MM-DD`` string."""
    return dt.datetime.strptime(s, '%Y-%m-%d').day
def explain_by_day(df_data):
    """Chart the summaries for the early, middle and late part of the month.

    Args:
        df_data: Table indexed by ``YYYY-MM-DD`` date strings. It is copied
            before the helper ``day`` column is added.

    Days 1-10 count as early, 11-19 as mid and 20 onwards as late.
    """
    tagged = df_data.copy()
    tagged['day'] = [to_day(stamp) for stamp in tagged.index]
    day_of_month = tagged['day']

    explain(tagged[day_of_month <= 10], 'early month / ')
    explain(tagged[(day_of_month > 10) & (day_of_month < 20)], 'mid month / ')
    explain(tagged[day_of_month >= 20], 'late month / ')
# Chart the same statistics split by early / mid / late month.
- explain_by_day(df_data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement