Guest User

Untitled

a guest
Jun 19th, 2018
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.07 KB | None | 0 0
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4.  
  5.  
  # Default configuration values for the script.
  # NOTE(review): the functions below take parameters with these same names;
  # presumably these globals are what gets passed in - confirm against the caller.
  # Directory of the data files (a Windows path, kept as a raw string).
  6. dataDir = r'E:\data\bismacro'
  # Name of the raw text input file.
  7. rawFile = 'bismacromonthly.txt'
  # Name of the HDF5 file.
  8. h5File = 'bismacro.h5'
  # Correlation threshold; presumably the `corrVal` argument of removeSimilarCorrTS.
  9. corrVal = 0.97
  10.  
  11.  
  def writeMacro(dataDir, rawFile, h5File):
      """Convert the tab-separated raw file to a DataFrame and save it as HDF5.

      Every line of the raw file is split on tabs.  Lines with fewer than
      five fields are skipped.  From each remaining line the second field
      (with double quotes removed) and all fields from the sixth onwards are
      kept.  The first kept line is the header; its trailing fields are
      parsed as month/year labels (format '%m/%Y').

      Args:
          dataDir: Directory containing ``rawFile``; ``h5File`` is written
              there as well.
          rawFile: Name of the tab-separated input file.
          h5File: Name of the HDF5 file to write (stored under key 'df').

      Returns:
          DataFrame indexed by date (one row per header label) with one
          column per kept line; cell values are the raw strings.

      Note:
          Paths are built with ``os.path.join`` instead of ``os.chdir`` so
          the process working directory is left untouched.
      """
      rawPath = os.path.join(dataDir, rawFile)
      h5Path = os.path.join(dataDir, h5File)

      rows = []
      with open(rawPath, 'r') as fin:
          for line in fin:
              # Drop the line terminator before splitting; otherwise it stays
              # glued to the last field and an empty last value becomes '\n'.
              tokens = line.rstrip('\r\n').split('\t')
              if len(tokens) < 5:
                  continue
              rows.append([tokens[1].replace('"', '')] + tokens[5:])

      df = pd.DataFrame(rows[1:], columns=rows[0])
      # The first column holds the series codes: use it as the index, then
      # transpose so that series become columns and dates become rows.
      df.index = df.iloc[:, 0]
      df = df.iloc[:, 1:].transpose()
      df.index = pd.to_datetime(list(df.index), format='%m/%Y')
      # `key` is passed by keyword; the positional form is deprecated.
      df.to_hdf(h5Path, key='df')
      return df
  31.  
  def readMacro(dataDir, h5File):
      """Load the DataFrame stored under key 'df' in an HDF5 file.

      Args:
          dataDir: Directory containing the HDF5 file.
          h5File: Name of the HDF5 file to read.  This argument is honoured;
              the file name is no longer hard-coded to 'bismacro.h5'.

      Returns:
          The DataFrame read from the file.

      Note:
          The path is built with ``os.path.join`` instead of ``os.chdir`` so
          the process working directory is left untouched.
      """
      return pd.read_hdf(os.path.join(dataDir, h5File), key='df')
  37.  
  def preprocess(df, start='2010-1-1', end='2017-12-1'):
      """Restrict to a date window and keep only complete, non-constant series.

      Steps:
        1. Keep the rows between ``start`` and ``end`` (label slice on the
           index, both ends inclusive).
        2. Drop every column with a missing observation in that window.
           ``None``, NaN and empty or whitespace-only strings count as
           missing.  The remaining values are converted to float.
        3. Drop every column whose values never change inside the window.

      Args:
          df: DataFrame with a date index and one column per time series.
          start: First date label of the window (default: January 2010).
          end: Last date label of the window (default: December 2017).

      Returns:
          A float DataFrame containing the surviving columns.
      """
      def _blankToNan(value):
          # Only strings are inspected for emptiness, so numeric cells and
          # NaN pass through instead of raising on len().
          if value is None:
              return np.nan
          if isinstance(value, str) and not value.strip():
              return np.nan
          return value

      # Preprocessing 1 - keep observations inside the requested window.
      df2 = df.loc[start:end]
      # Preprocessing 2 - remove series that have any missing value.
      # Series.map is applied column by column because DataFrame.applymap is
      # deprecated in recent pandas releases.
      df2 = df2.apply(lambda col: col.map(_blankToNan))
      df2 = df2.dropna(axis=1, how='any')
      df2 = df2.astype(float)
      # Preprocessing 3 - remove series whose values are all the same.
      # A series is kept when at least one step-to-step difference is
      # non-zero; requiring every difference to be non-zero would also
      # discard series that merely repeat a value once.
      hasChange = (df2.diff().iloc[1:, :] != 0).any()
      df2 = df2.loc[:, hasChange]
      print('preprocess step : 3 ')
      return df2
  49.  
  def removeSimilarCorrTS(df, corrVal):
      """Remove time series that are highly correlated with an earlier one.

      Columns are visited in their original order.  A column is kept unless
      a previously kept column has a correlation with it strictly greater
      than ``corrVal``.  Only positive correlation counts as "similar".

      Args:
          df: DataFrame with one column per time series.
          corrVal: Correlation threshold above which two series are
              considered similar.

      Returns:
          ``df`` restricted to the kept columns, in their original order.
      """
      corr = df.corr()
      # For every series, list the other series correlated above the
      # threshold.  The series itself is filtered out by comparison rather
      # than deleted by position, so a series whose self-correlation is not
      # above the threshold (NaN for a constant series, or corrVal >= 1) no
      # longer raises ValueError.
      similar = {
          col: [other for other in corr.index[corr[col] > corrVal] if other != col]
          for col in corr.columns
      }

      covered = set()   # series already represented by a kept series
      kept = []
      for col, others in similar.items():
          if col in covered:
              continue
          kept.append(col)
          covered.update(others)
      return df[kept]
Add Comment
Please, Sign In to add comment