user_137

Sierra DataFrame to SCID

Oct 7th, 2014
#!/usr/bin/python3
from __future__ import print_function
import numpy as np
import pandas as pd
import struct
import sys
from time import sleep, time
#from SC_Variables import *

time_list = []
overrun_list = []
overruns = 0

# bar lengths in minutes: long / medium / short timeframes (assumed meaning)
lt = 15
mt = 5
st = 1

ohlc = {'O': 'first', 'H': 'max', 'L': 'min', 'C': 'last',
        'V': 'sum', 'x': 'sum', 'y': 'sum', 'z': 'sum'}
cols = ['O', 'H', 'L', 'C', 'V', 'x', 'y', 'z']


class SierraFile(object):
    """Reader/writer for a SierraChart intraday (.scid) data file."""
    def __init__(self, filename):
        self.filename = str(filename)
        # self.tzAdjust = timedelta(hours=+10).seconds / d2s
        # +10 h timezone offset expressed as a fraction of a day
        self.tzAdjust = np.timedelta64(10, 'h') / np.timedelta64(1, 'D')
        self.excelDate = np.datetime64('1899-12-30')  # Excel/SCID epoch
        self.sizeHeader = 0x38  # header length in bytes
        self.sizeRecord = 0x28  # record length in bytes
        self.pos = 0
        self.last = 0

    def read_existing_records(self):
        with open(self.filename, 'rb') as fscid:
            fscid.read(self.sizeHeader)  # discard header
            rows = []
            ts = []
            for i in range(1000000):
                data = fscid.read(self.sizeRecord)
                if len(data) == self.sizeRecord:
                    d = struct.unpack('d4f4I', data)
                    # SCID timestamps are Excel-style serial day counts
                    dt = d[0] + self.tzAdjust
                    ts.append(self.excelDate + np.timedelta64(int(dt), 'D')
                              + np.timedelta64(int(round((dt - int(dt))
                                               * 86400)), 's'))
                    datarow = [d[1], d[2], d[3], d[4], d[5], 0, 0, 0]
                    rows.append(datarow)
                else:
                    break
            self.pos = self.last = fscid.tell()
        return (ts, rows)

    def read_record(self):
        global overruns, overrun_list
        with open(self.filename, 'rb') as fscid:
            fscid.seek(0, 2)  # go to the end of the file
            self.last = fscid.tell()
            if self.last == self.pos:  # no new data >> nothing to do
                return (-999, 0, 0)
            else:  # data to collect
                if self.pos < self.last - self.sizeRecord:  # > 1 record behind
                    print('Overrun', self.last - self.pos,
                          (self.last - self.pos) / self.sizeRecord)
                    overruns += 1
                    overrun_list.append(np.datetime64('now'))
                    late_flag = True
                else:
                    late_flag = False
                fscid.seek(self.pos, 0)
                self.pos += self.sizeRecord
                data = fscid.read(self.sizeRecord)
                d = struct.unpack('d4f4I', data)
                dt = d[0] + self.tzAdjust
                new_time = (self.excelDate + np.timedelta64(int(dt), 'D')
                            + np.timedelta64(int(round((dt - int(dt))
                                             * 86400)), 's'))
                datarow = [d[1], d[2], d[3], d[4], d[5], 0, 0, 0]
                return (new_time, datarow, late_flag)

    def write_existing_records(self, dataframe):
        with open(self.filename, 'wb') as fscid:
            header = b'SCID8\x00\x00\x00(\x00\x00\x00\x01\x00'
            fscid.write(header)
            for i in range(21):
                fscid.write(b'\x00\x00')  # pad header out to sizeHeader (0x38) bytes
            for i in range(dataframe.end):
                da = ((dataframe.df.index.values[i] - self.excelDate)
                      / np.timedelta64(1, 'D') - self.tzAdjust)
                db, dc, dd, de, df, dg, dh, di = dataframe.df.iloc[i][0:8]
                # include [0:8] so that dataframe can have more than 8 columns
                di = i  # 0x11100111
                df = int(df)
                dg = int(dg)
                dh = int(dh)
                di = int(di)
                wt = struct.pack('d4f4I', da, db, dc, dd, de, df, dg, dh, di)
                fscid.write(wt)

    def write_record(self, dataframe):
        with open(self.filename, 'ab') as fscid:
            i = dataframe.end - 1
            da = ((dataframe.df.index.values[i] - self.excelDate)
                  / np.timedelta64(1, 'D') - self.tzAdjust)
            db, dc, dd, de, df, dg, dh, di = dataframe.df.iloc[i][0:8]
            di = 0x88300388
            df = int(df)
            dg = int(dg)
            dh = int(dh)
            di = int(di)
            record = struct.pack('d4f4I', da, db, dc, dd, de, df, dg, dh, di)
            fscid.write(record)


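# Usage sketch (illustrative only, not part of the original run path): read an
# existing .scid file into (timestamps, rows). Each 0x28-byte record unpacks as
# 'd4f4I': an Excel-style serial date (days since 1899-12-30, with the day
# fraction carrying the time of day), four floats O, H, L, C, and four uint32
# volume/count fields of which only the first is kept as V here. The path is a
# placeholder.
#
# reader = SierraFile('/path/to/example.scid')
# ts, rows = reader.read_existing_records()
# print(ts[0], rows[0])   # first timestamp and its [O, H, L, C, V, 0, 0, 0] row
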
class SierraFrame(object):
    """
    DataFrame is the basic object for analysis:
        __init__ reads the .scid file data into the initial object, 5 sec bars assumed
        extend_frame adds ~5000 rows to the df because appending rows is slow
        add appends new data to the extended frame for real-time operation
        build_tf creates a new dataframe that is a multiple of the input df
        build_htf_array creates an array showing higher-timeframe bars as
          they develop for the lower-timeframe array
        countfloats is a test method
    """
    def __init__(self, time_index, data):
        self.df = pd.DataFrame(data, index=time_index,
                               columns=['O', 'H', 'L', 'C', 'V', 'x', 'y', 'z'])
        self.end = len(self.df)
        self.pos = 0

    def extend_frame(self):
        '''
        Create a ~5000 row array starting from the last time in self.df
        and append it to self.df.
        Remove the lunch break from the array.
        '''
        print('Extending DataFrame Now')
        s5 = np.timedelta64(5, 's')
        h1 = np.timedelta64(1, 'h')
        sl = np.datetime64('today') + np.timedelta64(14, 'h')  # lunch break start
        el = np.datetime64('today') + np.timedelta64(15, 'h')  # lunch break end
        start_time = self.df.index.values[self.end - 1]
        dtgen = ((start_time + i * s5) for i in range(1, 5000))
        dtstrip = ((i + h1 if sl <= i < el else i) for i in dtgen)
        dg = pd.DataFrame(index=dtstrip, columns=self.df.columns)
        #dg.iloc[:] = 0.0
        #dg[[v, x, y, z]] = dg[[v, x, y, z]].astype('int')
        # DataFrame.append was removed in pandas 2.x; use pd.concat([self.df, dg]) there
        self.df = self.df.append(dg)
        self.df = self.df.astype(np.float64)

    def add(self, new_time, datarow):
        '''
        Add a row to an existing extended df, but:
            extend if it's within 5 rows of the end
            fill with the last bar if it's not the next bar
            convert the four integer columns to float for df speed of access
        '''
        if self.end > len(self.df) - 5:
            self.extend_frame()  # not needed if first fill > day length
        np_time = np.datetime64(new_time)
        if np_time < self.df.index.values[self.end]:
            return  # new data is earlier than current
        while np_time > self.df.index.values[self.end]:
            self.df.iloc[self.end] = self.df.iloc[self.end - 1]
            self.end += 1  # fill with prior row if new is later
        for i in [4, 5, 6, 7]:
            datarow[i] = float(datarow[i])
        self.df.iloc[self.end] = datarow  # fill when times match
        #self.df.iloc[self.end] = self.df.iloc[self.end].astype(np.float64)
        self.end += 1

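    # Fill-forward illustration (assumed 5 s bar times, 'frame' is a placeholder
    # name): if the next empty slot is stamped 10:00:05 and the incoming record
    # is stamped 10:00:15, add() copies the prior bar into the 10:00:05 and
    # 10:00:10 slots, then writes the new row at 10:00:15:
    #
    #   frame.add(np.datetime64('2014-10-07T10:00:15'),
    #             [100.0, 101.0, 99.5, 100.5, 12, 0, 0, 0])
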
    def build_tf(self, ht):
        '''
        Create a higher-timeframe df that is a multiple of the input df,
        with ht being the higher-timeframe bar length in minutes.
        '''
        # resample(..., how=...) is the old pandas API; newer pandas uses
        # self.df.resample(str(ht) + 'min').agg(ohlc)
        return self.df.resample(str(ht)+'min', how=ohlc)[cols]

    def build_htf_array(self, st, ht):
        '''
        Map higher-timeframe bar development onto the input df,
        with st and ht being the low and high timeframe bar lengths in minutes.
        '''
        di = self.df.resample(str(st)+'min', how=ohlc)[cols]
        dih = di.iloc[:, 0:5].copy()  # copy so the assignments below do not hit a view
        for i in range(len(dih)):
            if i == 0 or i//ht > (i-1)//ht:  # first bar of a new higher-timeframe period
                bO = dih.iloc[i, 0]
                bH = dih.iloc[i, 1]
                bL = dih.iloc[i, 2]
                bC = dih.iloc[i, 3]
            else:
                dih.iloc[i, 0] = bO
                dih.iloc[i, 1] = bH = max(bH, dih.iloc[i, 1])
                dih.iloc[i, 2] = bL = min(bL, dih.iloc[i, 2])
                bC = dih.iloc[i, 3]
        return dih

    def countfloats(self):
        length = len(self.df)
        width = len(self.df.iloc[0])
        floats = 0
        nonfloats = 0
        for i in range(length):
            for j in range(width):
                if isinstance(self.df.iloc[i, j], float):
                    floats += 1
                else:
                    nonfloats += 1
        return (floats, nonfloats)

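# Usage sketch (illustrative, paths are placeholders): build a SierraFrame from
# a SierraFile and derive a 5-minute view of the assumed 5-second bars.
#
# src = SierraFile('/path/to/example.scid')
# frame = SierraFrame(*src.read_existing_records())
# bars_5min = frame.build_tf(5)   # OHLCV resampled to 5-minute bars
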
def build_htf_array(di, ht):
    '''
    Map higher timeframe development on to input df
    with ht being the high timeframe bar length in minutes
    '''
    dih = di.iloc[:, 0:5].copy()
    for i in range(len(dih)):
        if i == 0 or i//ht > (i-1)//ht:
            bO = dih.iloc[i, 0]
            bH = dih.iloc[i, 1]
            bL = dih.iloc[i, 2]
            bC = dih.iloc[i, 3]
        else:
            dih.iloc[i, 0] = bO
            dih.iloc[i, 1] = bH = max(bH, dih.iloc[i, 1])
            dih.iloc[i, 2] = bL = min(bL, dih.iloc[i, 2])
            bC = dih.iloc[i, 3]
    return dih

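# Worked example (assumed values) of the mapping above with ht = 3: given 1-min
# bars indexed 0..5, rows 0 and 3 start a new 3-min period and keep their own
# O/H/L values; rows 1-2 and 4-5 are overwritten with the developing 3-min bar,
# i.e. the period's open plus the running max of H and running min of L seen so
# far, while the C column already holds the latest close.
#
#   htf_view = build_htf_array(one_min_df, 3)   # one_min_df is a placeholder name
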
def build_tf(di, ht):
    '''
    Create a higher-timeframe df that is a multiple of the input, di,
    with ht being the higher-timeframe bar length in minutes.
    '''
    return di.resample(str(ht)+'min', how=ohlc)[cols]


def dt64_to_str(dt64):
    '''
    Convert numpy datetime64 to string in Ilya's format.
    '''
    s = str(dt64)
    return s[0:4]+s[5:7]+s[8:10]+s[11:13]+s[14:16]+s[17:19]


def str_to_dt64(s):
    '''
    Convert string in Ilya's format to numpy datetime64.
    '''
    date = s[0:4] + '-' + s[4:6] + '-' + s[6:8]
    time = 'T' + s[8:10] + ':' + s[10:12] + ':' + s[12:14]
    return np.datetime64(date + time)


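# Round-trip example (illustrative):
#
#   dt64_to_str(np.datetime64('2014-10-07T09:30:00'))  # -> '20141007093000'
#   str_to_dt64('20141007093000')                      # -> numpy.datetime64('2014-10-07T09:30:00')
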
def SierraRun():
    global time_list, overruns, overrun_list
    time0 = time()
    #filename = '/home/john/zRamdisk/SierraChart/Data/HSI-201306-HKFE-TD.scid'
    filename = '/home/john/zRamdisk/SierraChart/Data/HSIM13-FUT-HKFE-TD.scid'
    hsi = SierraFile(filename)
    time_index, data = hsi.read_existing_records()
    da = SierraFrame(time_index, data)
    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
    da.extend_frame()
    wtst = SierraFile('/home/john/zRamdisk/SierraChart/Data/HSI-INPUT.scid')
    wtst.write_existing_records(da)
    print('df ready', da.end - 1, time() - time0)
    print(da.df[da.end - 1:da.end + 1])
    print()
    df = da.df
    print('\n', np.datetime64('now'), da.end)
    print(df[da.end - 5:da.end + 5])

    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT


    #time_list = []
    #for i in range(4000):
        #intime = df.index.values[da.end]
        #time0 = time()
        #da.add(intime, [1.0, 2.0, 3.0, 4.0, 5, 6, 7, 8])
        #time_list.append(time() - time0)

    #if time_list:
        #print('TimeStats', max(time_list),
                #sum(time_list) / len(time_list))
    #print('\nEnd of NaN version')

    # print('next', hsi.pos, hsi.last)
    # jtst = SierraFile('/home/john/zRamdisk/SierraChart/Data/HSI-INPUT.scid')
    # time_index, data = jtst.read_existing_records()
    # ja = SierraFrame(time_index, data)
    # jf = ja.df
    # print('\n', ja.end)
    # print(df[ja.end-5:ja.end+5])
    # print('next', jtst.pos, jtst.last)
    # return  # ###################
    counter = 0
    # sys.stdout = os.fdopen(sys.stdout.fileno(), "w", newline=None)
    counter_flag = False
    timer_no_data = time()
    timer_no_data_flag = False
    overruns = 0
    overrun_list = []
    while True:
        time0 = time()
        new_time, data, late_flag = hsi.read_record()
        if new_time != -999:
            #time1 = time()
            da.add(new_time, data)
            #print("{:.6f}".format(time() - time1), end=' ')
            sys.stdout.flush()
            wtst.write_record(da)
            if counter > 3:
                time_list.append(time() - time0)
                timer_no_data = time()
            #print(da.df[da.end-1:da.end], da.end)
            print('.', end=' ')
            sys.stdout.flush()
            if timer_no_data_flag:
                print('Data Restored')
                timer_no_data = time()
                timer_no_data_flag = False
            counter += 1
            counter_flag = True
        if time() - timer_no_data >= 120 and not timer_no_data_flag:
            timer_no_data_flag = True
            print('Data lost for two minutes')
        if not late_flag:
            sleep_time = 0.1 - (time() - time0)  # poll roughly every 0.1 s unless behind
            if sleep_time > 0:
                sleep(sleep_time)
        if counter % 12 == 0 and counter_flag:
            counter_flag = False
            print('   Overruns:', overruns, overrun_list, end='    ')
            print('TimeStats', "{:.6f} {:.6f}".format(max(time_list),
                  sum(time_list) / len(time_list)), '\n', end=' ')
            # print(df[da.end-1:da.end])
            sys.stdout.flush()
            # break
        if counter % 60 == 0 and counter != 0:
            import ipdb; ipdb.set_trace()  # XXX BREAKPOINT


def main():
    SierraRun()

if __name__ == '__main__':
    """
    Takes a SierraChart .scid file (input argument 1) and converts
      it to a pandas DataFrame.
    Timezone conversion can follow the user's local timezone or a
      specified integer offset (input 'l' or an integer; if the default
      filename is being used, '' must be given for the filename).
    """
    print('start')
    sys.stdout.flush()
    main()
    print('fin')
    if time_list != []:
        print('TimeStats', "{:.6f} {:.6f}".format(max(time_list),
              sum(time_list) / len(time_list)), '\n', end=' ')