user_137

Sierra DataFrame to SCID

Oct 7th, 2014
#!/usr/bin/python3
from __future__ import print_function
import numpy as np
import pandas as pd
import struct
import sys
from time import sleep, time
#from SC_Variables import *

time_list = []
overrun_list = []
overruns = 0

# bar lengths in minutes: long / medium / short timeframes (assumed meaning)
lt = 15
mt = 5
st = 1

ohlc = {'O': 'first', 'H': 'max', 'L': 'min', 'C': 'last',
        'V': 'sum', 'x': 'sum', 'y': 'sum', 'z': 'sum'}
cols = ['O', 'H', 'L', 'C', 'V', 'x', 'y', 'z']


class SierraFile(object):
    """Reader/writer for a SierraChart intraday (.scid) data file."""
    def __init__(self, filename):
        self.filename = str(filename)
        # self.tzAdjust = timedelta(hours=+10).seconds / d2s
        # +10 h timezone offset expressed as a fraction of a day
        self.tzAdjust = np.timedelta64(10, 'h') / np.timedelta64(1, 'D')
        self.excelDate = np.datetime64('1899-12-30')  # Excel/SCID epoch
        self.sizeHeader = 0x38  # header length in bytes
        self.sizeRecord = 0x28  # record length in bytes
        self.pos = 0
        self.last = 0

    def read_existing_records(self):
        with open(self.filename, 'rb') as fscid:
            fscid.read(self.sizeHeader)  # discard header
            rows = []
            ts = []
            for i in range(1000000):
                data = fscid.read(self.sizeRecord)
                if len(data) == self.sizeRecord:
                    d = struct.unpack('d4f4I', data)
                    # SCID timestamps are Excel-style serial day counts
                    dt = d[0] + self.tzAdjust
                    ts.append(self.excelDate + np.timedelta64(int(dt), 'D')
                              + np.timedelta64(int(round((dt - int(dt))
                                               * 86400)), 's'))
                    datarow = [d[1], d[2], d[3], d[4], d[5], 0, 0, 0]
                    rows.append(datarow)
                else:
                    break
            self.pos = self.last = fscid.tell()
        return (ts, rows)

    def read_record(self):
        global overruns, overrun_list
        with open(self.filename, 'rb') as fscid:
            fscid.seek(0, 2)  # go to the end of the file
            self.last = fscid.tell()
            if self.last == self.pos:  # no new data >> nothing to do
                return (-999, 0, 0)
            else:  # data to collect
                if self.pos < self.last - self.sizeRecord:  # > 1 record behind
                    print('Overrun', self.last - self.pos,
                          (self.last - self.pos) / self.sizeRecord)
                    overruns += 1
                    overrun_list.append(np.datetime64('now'))
                    late_flag = True
                else:
                    late_flag = False
                fscid.seek(self.pos, 0)
                self.pos += self.sizeRecord
                data = fscid.read(self.sizeRecord)
                d = struct.unpack('d4f4I', data)
                dt = d[0] + self.tzAdjust
                new_time = (self.excelDate + np.timedelta64(int(dt), 'D')
                            + np.timedelta64(int(round((dt - int(dt))
                                             * 86400)), 's'))
                datarow = [d[1], d[2], d[3], d[4], d[5], 0, 0, 0]
                return (new_time, datarow, late_flag)

    def write_existing_records(self, dataframe):
        with open(self.filename, 'wb') as fscid:
            header = b'SCID8\x00\x00\x00(\x00\x00\x00\x01\x00'
            fscid.write(header)
            for i in range(21):
                fscid.write(b'\x00\x00')  # pad header out to sizeHeader (0x38) bytes
            for i in range(dataframe.end):
                da = ((dataframe.df.index.values[i] - self.excelDate)
                      / np.timedelta64(1, 'D') - self.tzAdjust)
                db, dc, dd, de, df, dg, dh, di = dataframe.df.iloc[i][0:8]
                # include [0:8] so that dataframe can have more than 8 columns
                di = i  # 0x11100111
                df = int(df)
                dg = int(dg)
                dh = int(dh)
                di = int(di)
                wt = struct.pack('d4f4I', da, db, dc, dd, de, df, dg, dh, di)
                fscid.write(wt)

    def write_record(self, dataframe):
        with open(self.filename, 'ab') as fscid:
            i = dataframe.end - 1
            da = ((dataframe.df.index.values[i] - self.excelDate)
                  / np.timedelta64(1, 'D') - self.tzAdjust)
            db, dc, dd, de, df, dg, dh, di = dataframe.df.iloc[i][0:8]
            di = 0x88300388
            df = int(df)
            dg = int(dg)
            dh = int(dh)
            di = int(di)
            record = struct.pack('d4f4I', da, db, dc, dd, de, df, dg, dh, di)
            fscid.write(record)


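# Usage sketch (illustrative only, not part of the original run path): read an
# existing .scid file into (timestamps, rows). Each 0x28-byte record unpacks as
# 'd4f4I': an Excel-style serial date (days since 1899-12-30, with the day
# fraction carrying the time of day), four floats O, H, L, C, and four uint32
# volume/count fields of which only the first is kept as V here. The path is a
# placeholder.
#
# reader = SierraFile('/path/to/example.scid')
# ts, rows = reader.read_existing_records()
# print(ts[0], rows[0])   # first timestamp and its [O, H, L, C, V, 0, 0, 0] row
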
class SierraFrame(object):
    """
    DataFrame is the basic object for analysis:
        __init__ reads the .scid file data into the initial object, 5 sec bars assumed
        extend_frame adds ~5000 rows to the df because appending rows is slow
        add appends new data to the extended frame for real-time operation
        build_tf creates a new dataframe that is a multiple of the input df
        build_htf_array creates an array showing higher-timeframe bars as
          they develop for the lower-timeframe array
        countfloats is a test method
    """
    def __init__(self, time_index, data):
        self.df = pd.DataFrame(data, index=time_index,
                               columns=['O', 'H', 'L', 'C', 'V', 'x', 'y', 'z'])
        self.end = len(self.df)
        self.pos = 0

    def extend_frame(self):
        '''
        Create a ~5000 row array starting from the last time in self.df
        and append it to self.df.
        Remove the lunch break from the array.
        '''
        print('Extending DataFrame Now')
        s5 = np.timedelta64(5, 's')
        h1 = np.timedelta64(1, 'h')
        sl = np.datetime64('today') + np.timedelta64(14, 'h')  # lunch break start
        el = np.datetime64('today') + np.timedelta64(15, 'h')  # lunch break end
        start_time = self.df.index.values[self.end - 1]
        dtgen = ((start_time + i * s5) for i in range(1, 5000))
        dtstrip = ((i + h1 if sl <= i < el else i) for i in dtgen)
        dg = pd.DataFrame(index=dtstrip, columns=self.df.columns)
        #dg.iloc[:] = 0.0
        #dg[[v, x, y, z]] = dg[[v, x, y, z]].astype('int')
        # DataFrame.append was removed in pandas 2.x; use pd.concat([self.df, dg]) there
        self.df = self.df.append(dg)
        self.df = self.df.astype(np.float64)

    def add(self, new_time, datarow):
        '''
        Add a row to an existing extended df, but:
            extend if it's within 5 rows of the end
            fill with the last bar if it's not the next bar
            convert the four integer columns to float for df speed of access
        '''
        if self.end > len(self.df) - 5:
            self.extend_frame()  # not needed if first fill > day length
        np_time = np.datetime64(new_time)
        if np_time < self.df.index.values[self.end]:
            return  # new data is earlier than current
        while np_time > self.df.index.values[self.end]:
            self.df.iloc[self.end] = self.df.iloc[self.end - 1]
            self.end += 1  # fill with prior row if new is later
        for i in [4, 5, 6, 7]:
            datarow[i] = float(datarow[i])
        self.df.iloc[self.end] = datarow  # fill when times match
        #self.df.iloc[self.end] = self.df.iloc[self.end].astype(np.float64)
        self.end += 1

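    # Fill-forward illustration (assumed 5 s bar times, 'frame' is a placeholder
    # name): if the next empty slot is stamped 10:00:05 and the incoming record
    # is stamped 10:00:15, add() copies the prior bar into the 10:00:05 and
    # 10:00:10 slots, then writes the new row at 10:00:15:
    #
    #   frame.add(np.datetime64('2014-10-07T10:00:15'),
    #             [100.0, 101.0, 99.5, 100.5, 12, 0, 0, 0])
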
    def build_tf(self, ht):
        '''
        Create a higher-timeframe df that is a multiple of the input df,
        with ht being the higher-timeframe bar length in minutes.
        '''
        # resample(..., how=...) is the old pandas API; newer pandas uses
        # self.df.resample(str(ht) + 'min').agg(ohlc)
        return self.df.resample(str(ht)+'min', how=ohlc)[cols]

    def build_htf_array(self, st, ht):
        '''
        Map higher-timeframe bar development onto the input df,
        with st and ht being the low and high timeframe bar lengths in minutes.
        '''
        di = self.df.resample(str(st)+'min', how=ohlc)[cols]
        dih = di.iloc[:, 0:5].copy()  # copy so the assignments below do not hit a view
        for i in range(len(dih)):
            if i == 0 or i//ht > (i-1)//ht:  # first bar of a new higher-timeframe period
                bO = dih.iloc[i, 0]
                bH = dih.iloc[i, 1]
                bL = dih.iloc[i, 2]
                bC = dih.iloc[i, 3]
            else:
                dih.iloc[i, 0] = bO
                dih.iloc[i, 1] = bH = max(bH, dih.iloc[i, 1])
                dih.iloc[i, 2] = bL = min(bL, dih.iloc[i, 2])
                bC = dih.iloc[i, 3]
        return dih

    def countfloats(self):
        length = len(self.df)
        width = len(self.df.iloc[0])
        floats = 0
        nonfloats = 0
        for i in range(length):
            for j in range(width):
                if isinstance(self.df.iloc[i, j], float):
                    floats += 1
                else:
                    nonfloats += 1
        return (floats, nonfloats)

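# Usage sketch (illustrative, paths are placeholders): build a SierraFrame from
# a SierraFile and derive a 5-minute view of the assumed 5-second bars.
#
# src = SierraFile('/path/to/example.scid')
# frame = SierraFrame(*src.read_existing_records())
# bars_5min = frame.build_tf(5)   # OHLCV resampled to 5-minute bars
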
def build_htf_array(di, ht):
    '''
    Map higher timeframe development on to input df
    with ht being the high timeframe bar length in minutes
    '''
    dih = di.iloc[:, 0:5].copy()
    for i in range(len(dih)):
        if i == 0 or i//ht > (i-1)//ht:
            bO = dih.iloc[i, 0]
            bH = dih.iloc[i, 1]
            bL = dih.iloc[i, 2]
            bC = dih.iloc[i, 3]
        else:
            dih.iloc[i, 0] = bO
            dih.iloc[i, 1] = bH = max(bH, dih.iloc[i, 1])
            dih.iloc[i, 2] = bL = min(bL, dih.iloc[i, 2])
            bC = dih.iloc[i, 3]
    return dih

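# Worked example (assumed values) of the mapping above with ht = 3: given 1-min
# bars indexed 0..5, rows 0 and 3 start a new 3-min period and keep their own
# O/H/L values; rows 1-2 and 4-5 are overwritten with the developing 3-min bar,
# i.e. the period's open plus the running max of H and running min of L seen so
# far, while the C column already holds the latest close.
#
#   htf_view = build_htf_array(one_min_df, 3)   # one_min_df is a placeholder name
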
def build_tf(di, ht):
    '''
    Create a higher-timeframe df that is a multiple of the input, di,
    with ht being the higher-timeframe bar length in minutes.
    '''
    return di.resample(str(ht)+'min', how=ohlc)[cols]


def dt64_to_str(dt64):
    '''
    Convert numpy datetime64 to string in Ilya's format.
    '''
    s = str(dt64)
    return s[0:4]+s[5:7]+s[8:10]+s[11:13]+s[14:16]+s[17:19]


def str_to_dt64(s):
    '''
    Convert string in Ilya's format to numpy datetime64.
    '''
    date = s[0:4] + '-' + s[4:6] + '-' + s[6:8]
    time = 'T' + s[8:10] + ':' + s[10:12] + ':' + s[12:14]
    return np.datetime64(date + time)


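# Round-trip example (illustrative):
#
#   dt64_to_str(np.datetime64('2014-10-07T09:30:00'))  # -> '20141007093000'
#   str_to_dt64('20141007093000')                      # -> numpy.datetime64('2014-10-07T09:30:00')
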
def SierraRun():
    global time_list, overruns, overrun_list
    time0 = time()
    #filename = '/home/john/zRamdisk/SierraChart/Data/HSI-201306-HKFE-TD.scid'
    filename = '/home/john/zRamdisk/SierraChart/Data/HSIM13-FUT-HKFE-TD.scid'
    hsi = SierraFile(filename)
    time_index, data = hsi.read_existing_records()
    da = SierraFrame(time_index, data)
    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
    da.extend_frame()
    wtst = SierraFile('/home/john/zRamdisk/SierraChart/Data/HSI-INPUT.scid')
    wtst.write_existing_records(da)
    print('df ready', da.end - 1, time() - time0)
    print(da.df[da.end - 1:da.end + 1])
    print()
    df = da.df
    print('\n', np.datetime64('now'), da.end)
    print(df[da.end - 5:da.end + 5])

    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT


    #time_list = []
    #for i in range(4000):
        #intime = df.index.values[da.end]
        #time0 = time()
        #da.add(intime, [1.0, 2.0, 3.0, 4.0, 5, 6, 7, 8])
        #time_list.append(time() - time0)

    #if time_list:
        #print('TimeStats', max(time_list),
                #sum(time_list) / len(time_list))
    #print('\nEnd of NaN version')

    # print('next', hsi.pos, hsi.last)
    # jtst = SierraFile('/home/john/zRamdisk/SierraChart/Data/HSI-INPUT.scid')
    # time_index, data = jtst.read_existing_records()
    # ja = SierraFrame(time_index, data)
    # jf = ja.df
    # print('\n', ja.end)
    # print(df[ja.end-5:ja.end+5])
    # print('next', jtst.pos, jtst.last)
    # return  # ###################
    counter = 0
    # sys.stdout = os.fdopen(sys.stdout.fileno(), "w", newline=None)
    counter_flag = False
    timer_no_data = time()
    timer_no_data_flag = False
    overruns = 0
    overrun_list = []
    while True:
        time0 = time()
        new_time, data, late_flag = hsi.read_record()
        if new_time != -999:
            #time1 = time()
            da.add(new_time, data)
            #print("{:.6f}".format(time() - time1), end=' ')
            sys.stdout.flush()
            wtst.write_record(da)
            if counter > 3:
                time_list.append(time() - time0)
                timer_no_data = time()
            #print(da.df[da.end-1:da.end], da.end)
            print('.', end=' ')
            sys.stdout.flush()
            if timer_no_data_flag:
                print('Data Restored')
                timer_no_data = time()
                timer_no_data_flag = False
            counter += 1
            counter_flag = True
        if time() - timer_no_data >= 120 and not timer_no_data_flag:
            timer_no_data_flag = True
            print('Data lost for two minutes')
        if not late_flag:
            sleep_time = 0.1 - (time() - time0)  # poll roughly every 0.1 s unless behind
            if sleep_time > 0:
                sleep(sleep_time)
        if counter % 12 == 0 and counter_flag:
            counter_flag = False
            print('   Overruns:', overruns, overrun_list, end='    ')
            print('TimeStats', "{:.6f} {:.6f}".format(max(time_list),
                  sum(time_list) / len(time_list)), '\n', end=' ')
            # print(df[da.end-1:da.end])
            sys.stdout.flush()
            # break
        if counter % 60 == 0 and counter != 0:
            import ipdb; ipdb.set_trace()  # XXX BREAKPOINT


def main():
    SierraRun()

if __name__ == '__main__':
    """
    Takes a SierraChart .scid file (input argument 1) and converts
      it to a pandas DataFrame.
    Timezone conversion can follow the user's local timezone or a
      specified integer offset (input 'l' or an integer; if the default
      filename is being used, '' must be given for the filename).
    """
    print('start')
    sys.stdout.flush()
    main()
    print('fin')
    if time_list != []:
        print('TimeStats', "{:.6f} {:.6f}".format(max(time_list),
              sum(time_list) / len(time_list)), '\n', end=' ')