Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os

import numpy as np
import pandas as pd
# Default settings for the functions below.
# NOTE(review): no call site is visible here; presumably these are passed to
# writeMacro / readMacro / removeSimilarCorrTS -- confirm against the caller.

# Working directory that holds the raw export and the derived HDF5 file.
dataDir = r'E:\data\bismacro'
# Name of the raw tab-separated text file (resolved relative to dataDir).
rawFile = 'bismacromonthly.txt'
# Name of the HDF5 file the parsed table is stored in.
h5File = 'bismacro.h5'
# Correlation threshold above which two series are considered similar.
corrVal = 0.97
def writeMacro(dataDir, rawFile, h5File):
    """Parse the raw tab-separated file into a DataFrame and save it as HDF5.

    Each line of ``rawFile`` is split on tabs. Lines with fewer than five
    fields are skipped. From every remaining line the second field (with
    double quotes removed) is used as the series code and the fields from the
    sixth onwards as its values. The first retained line is the header, whose
    value fields are parsed as ``%m/%Y`` dates.

    Args:
        dataDir: Directory to switch into; ``rawFile`` and ``h5File`` are
            resolved relative to it.
        rawFile: Name of the tab-separated input file.
        h5File: Name of the HDF5 file to write (stored under key ``'df'``).

    Returns:
        DataFrame indexed by date with one column per series code. Values are
        kept as the strings found in the file.

    Raises:
        IndexError: If the file contains no line with at least five fields.
    """
    # NOTE: this changes the process-wide working directory, as before.
    os.chdir(dataDir)

    rows = []
    with open(rawFile, 'r') as fin:
        for line in fin:
            # Strip the line terminator before splitting so the last field of
            # every row (header and data alike) carries no trailing newline.
            tokens = line.rstrip('\r\n').split('\t')
            if len(tokens) < 5:
                continue
            rows.append([tokens[1].replace('"', '')] + tokens[5:])

    # First retained row is the header: [code label, date, date, ...].
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df.index = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    # Transpose so that dates become the index and series codes the columns.
    df = df.transpose()
    df.index = pd.to_datetime(list(df.index), format=r'%m/%Y')
    # `key` is passed by keyword: positional use is deprecated in pandas.
    df.to_hdf(h5File, key='df')
    return df
def readMacro(dataDir, h5File):
    """Load the DataFrame stored under key ``'df'`` in an HDF5 file.

    Args:
        dataDir: Directory to switch into; ``h5File`` is resolved relative
            to it.
        h5File: Name of the HDF5 file to read.

    Returns:
        The DataFrame stored under key ``'df'``.

    Raises:
        FileNotFoundError: If ``h5File`` does not exist in ``dataDir``.
    """
    # NOTE: this changes the process-wide working directory, as before.
    os.chdir(dataDir)
    # Honour the h5File argument instead of a hard-coded file name.
    return pd.read_hdf(h5File, key='df')
def _blankToNan(value):
    """Return NaN for None or blank (empty / whitespace-only) strings."""
    if value is None:
        return np.nan
    if isinstance(value, str) and not value.strip():
        return np.nan
    return value


def preprocess(df, startDate='2010-1-1', endDate='2017-12-1'):
    """Restrict the table to a date window and keep only usable series.

    Steps:
      1. Keep the observations between ``startDate`` and ``endDate``
         (both inclusive, label-based slice on the index).
      2. Drop every series (column) that has a missing value in the window;
         None and blank strings count as missing. The rest is cast to float.
      3. Drop every series that is constant over the window.

    Args:
        df: DataFrame with a date index and one column per series.
        startDate: First date to keep. Defaults to January 2010.
        endDate: Last date to keep. Defaults to December 2017.

    Returns:
        A float DataFrame containing the remaining series.
    """
    # Preprocessing 1 - keep observations inside the requested window
    df2 = df.loc[startDate:endDate]

    # Preprocessing 2 - remove series that have any missing value.
    # Column-wise Series.map is used so this works across pandas versions
    # and does not fail on cells that are already NaN.
    df2 = df2.apply(lambda col: col.map(_blankToNan))
    df2 = df2.dropna(axis=1, how='any')
    df2 = df2.astype(float)

    # Preprocessing 3 - remove series whose values are all the same.
    # Only constant series are dropped; a series that merely repeats a
    # value in consecutive periods is kept.
    df2 = df2.loc[:, df2.nunique() > 1]
    print('preprocess step : 3 ')
    return df2
def removeSimilarCorrTS(df, corrVal):
    """Remove time series that are highly correlated with an earlier one.

    Columns are visited in order. A column is kept unless a previously kept
    column has a correlation with it greater than ``corrVal``; every column
    correlated above the threshold with a kept column is discarded.

    Args:
        df: DataFrame with one numeric column per time series.
        corrVal: Correlation threshold; pairs strictly above it are treated
            as similar.

    Returns:
        ``df`` restricted to the kept columns, in their original order.
    """
    corr = df.corr()
    kept = []
    discarded = set()
    for name in corr.columns:
        if name in discarded:
            continue
        kept.append(name)
        similar = corr.index[corr[name] > corrVal]
        # Exclude the column itself by comparison rather than by deleting it
        # from the list: its self-correlation may be NaN (constant series) or
        # not exceed corrVal (corrVal >= 1), in which case it is not listed.
        discarded.update(other for other in similar if other != name)
    return df[kept]
Add Comment
Please, Sign In to add comment