makiolo

dataframe python

Feb 13th, 2017
132
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import copy
  2. import logging
  3. from collections import OrderedDict
  4.  
  5. class dataframe(OrderedDict):
  6.     def __init__(self, l=[]):
  7.         super(dataframe, self).__init__(l)
  8.         self._names = tuple()
  9.  
  10.     def copy(self):
  11.         df = super(dataframe, self).copy()
  12.         df._names = copy.deepcopy(self.cols())
  13.         return df
  14.  
  15.     def cols(self):
  16.         return self._names
  17.  
  18.     def rows(self):
  19.         return tuple(self.keys())
  20.  
  21.     def __str__(self):
  22.         buff = '\t'
  23.         buff += '\t'.join(self.cols())
  24.         buff += '\n'
  25.         for k,values in self.iteritems():
  26.             buff += '{}\t'.format(k)
  27.             for v in values:
  28.                 buff += '{}\t'.format(v)
  29.             buff += '\n'
  30.         return buff
  31.  
  32.     @staticmethod
  33.     def dcast(dataset, row, col, value, functor = lambda x,y: x+y, neutral=0.0):
  34.         '''
  35.        casting to wide format (similar to dcast in reshape2@R)
  36.  
  37.        dataset is a list of tuples
  38.        row col and value are index of tuple
  39.        func aggregation is sum() in hardcoded way
  40.        '''
  41.         col_uniques = []
  42.         row_uniques = []
  43.         for tupl in dataset:
  44.             if tupl[col] not in col_uniques:
  45.                 col_uniques.append(tupl[col])
  46.             if tupl[row] not in row_uniques:
  47.                 row_uniques.append(tupl[row])
  48.         result = dataframe()
  49.         names = []
  50.         for tupl_col in col_uniques:
  51.             names.append(tupl_col)
  52.         result._names = tuple(names)
  53.         for tupl_row in row_uniques:
  54.             row_list = []
  55.             for tupl_col in col_uniques:
  56.                 total = []
  57.                 for tupl in dataset:
  58.                     if (tupl[row] == tupl_row) and (tupl[col] == tupl_col):
  59.                         total.append( float(tupl[value]) )
  60.                 if len(total) > 0:
  61.                     reduced = reduce(functor, total)
  62.                 else:
  63.                     reduced = neutral
  64.                 row_list.append(reduced)
  65.             result[tupl_row] = tuple(row_list)
  66.         return result
  67.  
  68.     def melt(self, permutation=False):
  69.         '''
  70.        undo pivot table
  71.  
  72.        return a list of tuples (row, col, value) or (col, row, value) (with permutation=True)
  73.        '''
  74.         dataset = []
  75.         for k, values in self.iteritems():
  76.             for c, v in enumerate(values):
  77.                 if not permutation:
  78.                     dataset.append( (k, self.cols()[c], v) )
  79.                 else:
  80.                     dataset.append( (self.cols()[c], k, v) )
  81.         return dataset
  82.  
  83.     @staticmethod
  84.     def dindex_col(df, col_name):
  85.         '''
  86.        get index from column name
  87.        '''
  88.         for i in range(len(df.cols())):
  89.             if df.cols()[i] == col_name:
  90.                 break
  91.         else:
  92.             raise Exception('col name: {} is not found'.format(col_name))
  93.         return i
  94.  
  95.     @staticmethod
  96.     def dindex_row(df, row_name):
  97.         '''
  98.        get index from row name
  99.        '''
  100.         for i in range(len(df.rows())):
  101.             if df.rows()[i] == row_name:
  102.                 break
  103.         else:
  104.             raise Exception('row name: {} is not found'.format(row_name))
  105.         return i
  106.  
  107.     @staticmethod
  108.     def dname_col(df, col_index):
  109.         '''
  110.        get name from column index
  111.        '''
  112.         for i, col_name in enumerate(df.cols()):
  113.             if i == col_index:
  114.                 return col_name
  115.         raise Exception('not found name for col index: {}'.format(col_index))
  116.  
  117.     @staticmethod
  118.     def dname_row(df, row_index):
  119.         '''
  120.        get name from row index
  121.        '''
  122.         for i, row_name in enumerate(df.rows()):
  123.             if i == row_index:
  124.                 return row_name
  125.         raise Exception('not found name for row index: {}'.format(row_index))
  126.  
  127.     @staticmethod
  128.     def dswap_col(df, a, b):
  129.         '''
  130.        swap col a and b
  131.        a, b are indexes
  132.        '''
  133.         newdf = df.copy()
  134.         assert(a < len(newdf.cols()))
  135.         assert(b < len(newdf.cols()))
  136.         nc = list(newdf.cols())
  137.         nc[a], nc[b] = nc[b], nc[a]
  138.         newdf._names = tuple(nc)
  139.         for k,v in newdf.iteritems():
  140.             l = list(v)
  141.             l[a], l[b] = l[b], l[a]
  142.             newdf[k] = tuple(l)
  143.         return newdf
  144.  
  145.     @staticmethod
  146.     def dswap_row(df, a, b):
  147.         newdf = dataframe()
  148.         newdf._names = copy.deepcopy(df.cols())
  149.         i = 0
  150.         for k,v in df.iteritems():
  151.             if (i != a) and (i != b):
  152.                 newdf[k] = v
  153.             else:
  154.                 if i == a:
  155.                     k2 = dataframe.dname_row(df, b)
  156.                 else:
  157.                     assert(i == b)
  158.                     k2 = dataframe.dname_row(df, a)
  159.                 newdf[k2] = df[k2]
  160.             i += 1
  161.         return newdf
  162.  
  163.     @staticmethod
  164.     def _partition(df, array, sort, reverse, begin, end):
  165.         assert(df is not None)
  166.         pivot = begin
  167.         for i in xrange(begin+1, end+1):
  168.             if bool(array[i] <= array[begin] and not reverse) ^ bool(array[i] > array[begin] and reverse):
  169.                 pivot += 1
  170.                 array[i], array[pivot] = array[pivot], array[i]
  171.                 if sort == 'row':
  172.                     df = dataframe.dswap_col(df, i, pivot)
  173.                 elif sort == 'col':
  174.                     df = dataframe.dswap_row(df, i, pivot)
  175.                 else:
  176.                     raise Exception('invalid sort mode: {}'.format(sort))
  177.         array[pivot], array[begin] = array[begin], array[pivot]
  178.         if sort == 'row':
  179.             df = dataframe.dswap_col(df, begin, pivot)
  180.         elif sort == 'col':
  181.             df = dataframe.dswap_row(df, begin, pivot)
  182.         else:
  183.             raise Exception('invalid sort mode: {}'.format(sort))
  184.         return df, pivot
  185.  
  186.     @staticmethod
  187.     def _quicksort(df, array, sort='row', reverse=False, begin=0, end=None):
  188.         assert(df is not None)
  189.         if end is None:
  190.             end = len(array) - 1
  191.         if begin >= end:
  192.             return df
  193.         df, pivot = dataframe._partition(df, array, sort, reverse, begin, end)
  194.         df = dataframe._quicksort(df, array, sort, reverse, begin, pivot-1)
  195.         df = dataframe._quicksort(df, array, sort, reverse, pivot+1, end)
  196.         return df
  197.  
  198.     @staticmethod
  199.     def get_row(df, row_name):
  200.         '''
  201.        return row in tuple
  202.        '''
  203.         return df[row_name]
  204.  
  205.     @staticmethod
  206.     def get_col(df, col_name):
  207.         '''
  208.        return col in list
  209.        '''
  210.         l = []
  211.         col_index = dataframe.dindex_col(df, col_name)
  212.         for k,v in df.iteritems():
  213.             l.append( v[col_index] )
  214.         return l
  215.  
  216.     @staticmethod
  217.     def dsort_row(df, row_name, reverse=False):
  218.         '''
  219.        use dswap_col
  220.        row is index
  221.        '''
  222.         newdf = df.copy()
  223.         return dataframe._quicksort(newdf, list(dataframe.get_row(newdf, row_name)), sort='row', reverse=reverse)
  224.  
  225.     @staticmethod
  226.     def dsort_col(df, col_name, reverse=False):
  227.         '''
  228.        use dswap_row
  229.        col is index
  230.        '''
  231.         newdf = df.copy()
  232.         return dataframe._quicksort(newdf, list(dataframe.get_col(newdf, col_name)), sort='col', reverse=reverse)
  233.  
  234.     @staticmethod
  235.     def dsort_row_index(df, row, reverse=False):
  236.         '''
  237.        use dswap_col
  238.        row is index
  239.        '''
  240.         newdf = df.copy()
  241.         return dataframe.dsort_row(newdf, dataframe.dname_row(newdf, row), reverse=reverse )
  242.  
  243.     @staticmethod
  244.     def dsort_col_index(df, col, reverse=False):
  245.         '''
  246.        use dswap_row
  247.        col is index
  248.        '''
  249.         newdf = df.copy()
  250.         return dataframe.dsort_col(newdf, dataframe.dname_col(newdf, col), reverse=reverse )
  251.  
  252.     @staticmethod
  253.     def dorder(df, new_order, reverse=False):
  254.         newdf = df.copy()
  255.         indexes = [0 for _ in range(len(newdf.cols()))]
  256.         assert(len(indexes) == len(newdf.cols()))
  257.         for i, col_to_move in enumerate(new_order):
  258.             try:
  259.                 idx = dataframe.dindex_col(newdf, col_to_move)
  260.                 indexes[ idx ] = i
  261.             except Exception:
  262.                 logging.warning('no found column: {}'.format(col_to_move))
  263.         return dataframe._quicksort(newdf, indexes, sort='row', reverse=reverse)
  264.  
  265.     @staticmethod
  266.     def dremove_cols(df, cols_remove):
  267.         '''
  268.        cols_remove is a list with names to remove cols
  269.        '''
  270.         indexes = []
  271.         for col_name in cols_remove:
  272.             indexes.append( dataframe.dindex_col(df, col_name) )
  273.         # iterate index in sort reverse order (for avoid index change while removing elements)
  274.         indexes.sort(reverse=True)
  275.         newdf = dataframe()
  276.         c = list(df.cols())
  277.         for i in indexes:
  278.             del c[i]
  279.         newdf._names = tuple(c)
  280.         for k,v in df.iteritems():
  281.             l = list(v)
  282.             for i in indexes:
  283.                 del l[i]
  284.             newdf[k] = tuple(l)
  285.         return newdf
  286.  
  287.     @staticmethod
  288.     def dremove_rows(df, rows_remove):
  289.         '''
  290.        rows_remove is a list with key names to remove rows
  291.        '''
  292.         indexes = []
  293.         for row_name in rows_remove:
  294.             indexes.append( dataframe.dindex_row(df, row_name) )
  295.         # iterate index in sort reverse order (for avoid index change while removing elements)
  296.         indexes.sort(reverse=True)
  297.         newdf = df.copy()
  298.         idx = len(newdf) - 1
  299.         for k in reversed(df):
  300.             if idx in indexes:
  301.                 del newdf[k]
  302.             idx -= 1
  303.         return newdf
  304.  
  305.     @staticmethod
  306.     def dremove_cols_if_row(df, row_name, functor):
  307.         '''
  308.        parm df: dataframe
  309.        parm col_name: name row to check
  310.        parm functor: apply functor in each row value and remove cols in positive case
  311.        '''
  312.         newdf = df.copy()
  313.         i = 0
  314.         try:
  315.             for v in newdf[row_name]:
  316.                 if functor(v):
  317.                     newdf = dataframe.dremove_cols(newdf, [ dataframe.dname_col(newdf, i) ])
  318.                 else:
  319.                     i += 1
  320.         except KeyError:
  321.             logging.warning('no found row: {}'.format(row_name))
  322.         return newdf
  323.  
  324.     @staticmethod
  325.     def dremove_rows_if_col(df, col_name, functor):
  326.         '''
  327.        parm df: dataframe
  328.        parm col_name: name col to check
  329.        parm functor: apply functor in each col value and remove rows in positive case
  330.        '''
  331.         newdf = df.copy()
  332.         for k,v in newdf.iteritems():
  333.             if functor(v[dataframe.dindex_col(newdf, col_name)]):
  334.                 newdf = dataframe.dremove_rows(newdf, [k])
  335.         return newdf
  336.  
  337.     @staticmethod
  338.     def writef(f, text):
  339.         text = text.replace("'", '"')
  340.         f.write('{}\n'.format(text))
  341.  
  342.     @staticmethod
  343.     def to_json(df, filename, x='name'):
  344.         with open(filename, 'wt') as f:
  345.             dataframe.writef(f, '[')
  346.             dataframe.writef(f, '\t{},'.format([x] + list(df.cols())))
  347.             i = 0
  348.             for k,v in df.iteritems():
  349.                 if i != len(df) - 1:
  350.                     sep = ','
  351.                 else:
  352.                     sep = ''
  353.                 dataframe.writef(f, '\t{}{}'.format([k] + list(v), sep))
  354.                 i += 1
  355.             dataframe.writef(f, ']')
  356.  
  357. if __name__ == '__main__':
  358.     dataset = []
  359.     for r in range(10):
  360.         for c in range(10):
  361.             dataset.append( ('row{}'.format(r), 'col{}'.format(c), r+c) )
  362.     dataset_pivot = dataframe.dcast(dataset, row=0, col=1, value=2, functor=lambda x,y: x + y)
  363.     dataset_pivot = dataframe.dsort_col(dataset_pivot, 'col9', reverse=True)
  364.     dataset_pivot = dataframe.dremove_cols(dataset_pivot, ['col4', 'col5', 'col6', 'col7', 'col8', 'col9'])
  365.     dataset_pivot = dataframe.dorder(dataset_pivot, ['col0', 'col2', 'col1', 'col3'])
  366.     new_dataset = dataframe.melt(dataset_pivot)
  367.     new_dataset_pivot = dataframe.dcast(new_dataset, row=0, col=1, value=2)
  368.     assert(dataset_pivot == new_dataset_pivot)
  369.     new_dataset_pivot = dataframe.dremove_rows_if_col(new_dataset_pivot, 'col1', lambda x: x < 6.0)
  370.     new_dataset_pivot = dataframe.dremove_cols_if_row(new_dataset_pivot, 'row7', lambda x: x > 8.0)
  371.     new_dataset = dataframe.melt(dataset_pivot)
  372.     for r, c, v in new_dataset:
  373.         print '{} - {} - {}'.format(r, c, v)
RAW Paste Data