makiolo

python pivot table

Feb 16th, 2017
130
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import copy
  2. import logging
  3. import fnmatch
  4. from collections import OrderedDict
  5.  
  6. class dataframe(OrderedDict):
  7.     def __init__(self, l=[]):
  8.         super(dataframe, self).__init__(l)
  9.         self._names = tuple()
  10.  
  11.     def copy(self):
  12.         df = super(dataframe, self).copy()
  13.         df._names = copy.deepcopy(self.cols())
  14.         return df
  15.  
  16.     def cols(self):
  17.         return self._names
  18.  
  19.     def rows(self):
  20.         return tuple(self.keys())
  21.  
  22.     def __str__(self):
  23.         buff = '\t'
  24.         buff += '\t'.join(self.cols())
  25.         buff += '\n'
  26.         for k,values in self.iteritems():
  27.             buff += '{}\t'.format(k)
  28.             for v in values:
  29.                 buff += '{}\t'.format(v)
  30.             buff += '\n'
  31.         return buff
  32.  
  33.     @staticmethod
  34.     def dcast(dataset, row, col, value, functor = lambda x,y: x+y, neutral=0.0):
  35.         '''
  36.        casting to wide format (similar to dcast in reshape2@R)
  37.  
  38.        dataset is a list of tuples
  39.        row col and value are index of tuple
  40.        func aggregation is sum() in hardcoded way
  41.        '''
  42.         col_uniques = []
  43.         row_uniques = []
  44.         for tupl in dataset:
  45.             if tupl[col] not in col_uniques:
  46.                 col_uniques.append(tupl[col])
  47.             if tupl[row] not in row_uniques:
  48.                 row_uniques.append(tupl[row])
  49.         result = dataframe()
  50.         names = []
  51.         for tupl_col in col_uniques:
  52.             names.append(tupl_col)
  53.         result._names = tuple(names)
  54.         for tupl_row in row_uniques:
  55.             row_list = []
  56.             for tupl_col in col_uniques:
  57.                 total = []
  58.                 for tupl in dataset:
  59.                     if (tupl[row] == tupl_row) and (tupl[col] == tupl_col):
  60.                         total.append( float(tupl[value]) )
  61.                 if len(total) > 0:
  62.                     reduced = reduce(functor, total)
  63.                 else:
  64.                     reduced = neutral
  65.                 row_list.append(reduced)
  66.             result[tupl_row] = tuple(row_list)
  67.         return result
  68.  
  69.     def melt(self, permutation=False):
  70.         '''
  71.        undo pivot table
  72.  
  73.        return a list of tuples (row, col, value) or (col, row, value) (with permutation=True)
  74.        '''
  75.         dataset = []
  76.         for k, values in self.iteritems():
  77.             for c, v in enumerate(values):
  78.                 if not permutation:
  79.                     dataset.append( (k, self.cols()[c], v) )
  80.                 else:
  81.                     dataset.append( (self.cols()[c], k, v) )
  82.         return dataset
  83.  
  84.     @staticmethod
  85.     def dindex_col(df, col_name):
  86.         '''
  87.        get index from column name
  88.        '''
  89.         for i in range(len(df.cols())):
  90.             if df.cols()[i] == col_name:
  91.                 break
  92.         else:
  93.             raise Exception('col name: {} is not found'.format(col_name))
  94.         return i
  95.  
  96.     @staticmethod
  97.     def dindex_row(df, row_name):
  98.         '''
  99.        get index from row name
  100.        '''
  101.         for i in range(len(df.rows())):
  102.             if df.rows()[i] == row_name:
  103.                 break
  104.         else:
  105.             raise Exception('row name: {} is not found'.format(row_name))
  106.         return i
  107.  
  108.     @staticmethod
  109.     def dname_col(df, col_index):
  110.         '''
  111.        get name from column index
  112.        '''
  113.         for i, col_name in enumerate(df.cols()):
  114.             if i == col_index:
  115.                 return col_name
  116.         raise Exception('not found name for col index: {}'.format(col_index))
  117.  
  118.     @staticmethod
  119.     def dname_row(df, row_index):
  120.         '''
  121.        get name from row index
  122.        '''
  123.         for i, row_name in enumerate(df.rows()):
  124.             if i == row_index:
  125.                 return row_name
  126.         raise Exception('not found name for row index: {}'.format(row_index))
  127.  
  128.     @staticmethod
  129.     def dswap_col(df, a, b):
  130.         '''
  131.        swap col a and b
  132.        a, b are indexes
  133.        '''
  134.         assert(a < len(df.cols()))
  135.         assert(b < len(df.cols()))
  136.         nc = list(df.cols())
  137.         nc[a], nc[b] = nc[b], nc[a]
  138.         df._names = tuple(nc)
  139.         for k,v in df.iteritems():
  140.             l = list(v)
  141.             l[a], l[b] = l[b], l[a]
  142.             df[k] = tuple(l)
  143.         return df
  144.  
  145.     @staticmethod
  146.     def dswap_row(df, a, b):
  147.         newdf = dataframe()
  148.         newdf._names = copy.deepcopy(df.cols())
  149.         i = 0
  150.         for k,v in df.iteritems():
  151.             if (i != a) and (i != b):
  152.                 newdf[k] = v
  153.             else:
  154.                 if i == a:
  155.                     k2 = dataframe.dname_row(df, b)
  156.                 else:
  157.                     assert(i == b)
  158.                     k2 = dataframe.dname_row(df, a)
  159.                 newdf[k2] = df[k2]
  160.             i += 1
  161.         return newdf
  162.  
  163.     @staticmethod
  164.     def _partition(df, array, sort, reverse, begin, end):
  165.         assert(df is not None)
  166.         pivot = begin
  167.         for i in xrange(begin+1, end+1):
  168.             if bool(array[i] <= array[begin] and not reverse) ^ bool(array[i] > array[begin] and reverse):
  169.                 pivot += 1
  170.                 array[i], array[pivot] = array[pivot], array[i]
  171.                 if sort == 'row':
  172.                     df = dataframe.dswap_col(df, i, pivot)
  173.                 elif sort == 'col':
  174.                     df = dataframe.dswap_row(df, i, pivot)
  175.                 else:
  176.                     raise Exception('invalid sort mode: {}'.format(sort))
  177.         array[pivot], array[begin] = array[begin], array[pivot]
  178.         if sort == 'row':
  179.             df = dataframe.dswap_col(df, begin, pivot)
  180.         elif sort == 'col':
  181.             df = dataframe.dswap_row(df, begin, pivot)
  182.         else:
  183.             raise Exception('invalid sort mode: {}'.format(sort))
  184.         return df, pivot
  185.  
  186.     @staticmethod
  187.     def _quicksort(df, array, sort='row', reverse=False, begin=0, end=None):
  188.         assert(df is not None)
  189.         if end is None:
  190.             end = len(array) - 1
  191.         if begin >= end:
  192.             return df
  193.         df, pivot = dataframe._partition(df, array, sort, reverse, begin, end)
  194.         df = dataframe._quicksort(df, array, sort, reverse, begin, pivot-1)
  195.         df = dataframe._quicksort(df, array, sort, reverse, pivot+1, end)
  196.         return df
  197.  
  198.     @staticmethod
  199.     def get_row(df, row_name):
  200.         '''
  201.        return row in tuple
  202.        '''
  203.         return df[row_name]
  204.  
  205.     @staticmethod
  206.     def get_col(df, col_name):
  207.         '''
  208.        return col in list
  209.        '''
  210.         l = []
  211.         col_index = dataframe.dindex_col(df, col_name)
  212.         for k,v in df.iteritems():
  213.             l.append( v[col_index] )
  214.         return l
  215.  
  216.     @staticmethod
  217.     def dsort_row(df, row_name, reverse=False):
  218.         '''
  219.        use dswap_col
  220.        row is index
  221.        '''
  222.         return dataframe._quicksort(df, list(dataframe.get_row(df, row_name)), sort='row', reverse=reverse)
  223.  
  224.     @staticmethod
  225.     def dsort_col(df, col_name, reverse=False):
  226.         '''
  227.        use dswap_row
  228.        col is index
  229.        '''
  230.         return dataframe._quicksort(df, list(dataframe.get_col(df, col_name)), sort='col', reverse=reverse)
  231.  
  232.     @staticmethod
  233.     def dsort_row_index(df, row, reverse=False):
  234.         '''
  235.        use dswap_col
  236.        row is index
  237.        '''
  238.         return dataframe.dsort_row(df, dataframe.dname_row(df, row), reverse=reverse )
  239.  
  240.     @staticmethod
  241.     def dsort_col_index(df, col, reverse=False):
  242.         '''
  243.        use dswap_row
  244.        col is index
  245.        '''
  246.         return dataframe.dsort_col(df, dataframe.dname_col(df, col), reverse=reverse )
  247.  
  248.     @staticmethod
  249.     def dorder(df, new_order, reverse=False):
  250.         indexes = [0 for _ in range(len(df.cols()))]
  251.         assert(len(indexes) == len(df.cols()))
  252.         for i, col_to_move in enumerate(new_order):
  253.             try:
  254.                 idx = dataframe.dindex_col(df, col_to_move)
  255.                 indexes[ idx ] = i
  256.             except Exception:
  257.                 logging.warning('no found column: {}'.format(col_to_move))
  258.         return dataframe._quicksort(df, indexes, sort='row', reverse=reverse)
  259.  
  260.     @staticmethod
  261.     def dremove_cols(df, cols_remove):
  262.         '''
  263.        cols_remove is a list with names to remove cols
  264.        '''
  265.         indexes = []
  266.         for col_name in cols_remove:
  267.             indexes.append( dataframe.dindex_col(df, col_name) )
  268.         # iterate index in sort reverse order (for avoid index change while removing elements)
  269.         indexes.sort(reverse=True)
  270.         c = list(df.cols())
  271.         for i in indexes:
  272.             del c[i]
  273.         df._names = tuple(c)
  274.         for k,v in df.iteritems():
  275.             l = list(v)
  276.             for i in indexes:
  277.                 del l[i]
  278.             df[k] = tuple(l)
  279.         return df
  280.  
  281.     @staticmethod
  282.     def dremove_col(df, pattern):
  283.         '''
  284.        remove column name by globing
  285.        '''
  286.         cols_remove = []
  287.         for col_name in df.cols():
  288.             if fnmatch.fnmatch(col_name, pattern):
  289.                 cols_remove.append( col_name )
  290.         if len(cols_remove) > 0:
  291.             return dataframe.dremove_cols(df, cols_remove)
  292.         else:
  293.             return df
  294.    
  295.     @staticmethod
  296.     def dremove_rows(df, rows_remove):
  297.         '''
  298.        rows_remove is a list with key names to remove rows
  299.        '''
  300.         indexes = []
  301.         for row_name in rows_remove:
  302.             indexes.append( dataframe.dindex_row(df, row_name) )
  303.         # iterate index in sort reverse order (for avoid index change while removing elements)
  304.         indexes.sort(reverse=True)
  305.         idx = len(df) - 1
  306.         for k in reversed(df):
  307.             if idx in indexes:
  308.                 logging.debug('removing row {}'.format(k))
  309.                 del df[k]
  310.             idx -= 1
  311.         return df
  312.  
  313.     @staticmethod
  314.     def dremove_row(df, pattern):
  315.         '''
  316.        remove rows by name
  317.        '''
  318.         rows_remove = []
  319.         for row_name in df.rows():
  320.             if fnmatch.fnmatch(row_name, pattern):
  321.                 rows_remove.append( row_name )
  322.         if len(rows_remove) > 0:
  323.             return dataframe.dremove_rows(df, rows_remove)
  324.         else:
  325.             return df
  326.  
  327.     @staticmethod
  328.     def dremove_cols_if_row(df, row_name, functor):
  329.         '''
  330.        parm df: dataframe
  331.        parm col_name: name row to check
  332.        parm functor: apply functor in each row value and remove cols in positive case
  333.        '''
  334.         i = 0
  335.         try:
  336.             for v in df[row_name]:
  337.                 if functor(v):
  338.                     df = dataframe.dremove_cols(df, [ dataframe.dname_col(df, i) ])
  339.                 else:
  340.                     i += 1
  341.         except KeyError:
  342.             logging.warning('no found row: {}'.format(row_name))
  343.         return df
  344.  
  345.     @staticmethod
  346.     def dremove_rows_if_col(df, col_name, functor):
  347.         '''
  348.        parm df: dataframe
  349.        parm col_name: name col to check
  350.        parm functor: apply functor in each col value and remove rows in positive case
  351.        '''
  352.         for k,v in df.iteritems():
  353.             if functor(v[dataframe.dindex_col(df, col_name)]):
  354.                 df = dataframe.dremove_rows(df, [k])
  355.         return df
  356.  
  357.     @staticmethod
  358.     def writef(f, text):
  359.         text = text.replace("'", '"')
  360.         f.write('{}\n'.format(text))
  361.  
  362.     @staticmethod
  363.     def to_json(df, filename, x='name'):
  364.         with open(filename, 'wt') as f:
  365.             dataframe.writef(f, '[')
  366.             dataframe.writef(f, '\t{},'.format([x] + list(df.cols())))
  367.             i = 0
  368.             for k,v in df.iteritems():
  369.                 if i != len(df) - 1:
  370.                     sep = ','
  371.                 else:
  372.                     sep = ''
  373.                 dataframe.writef(f, '\t{}{}'.format([k] + list(v), sep))
  374.                 i += 1
  375.             dataframe.writef(f, ']')
  376.  
  377. if __name__ == '__main__':
  378.     dataset = []
  379.     for r in range(10):
  380.         for c in range(10):
  381.             dataset.append( ('row{}'.format(r), 'col{}'.format(c), r+c) )
  382.     dataset_pivot = dataframe.dcast(dataset, row=0, col=1, value=2, functor=lambda x,y: x + y)
  383.     dataset_pivot = dataframe.dsort_col(dataset_pivot, 'col9', reverse=True)
  384.     dataset_pivot = dataframe.dremove_cols(dataset_pivot, ['col4', 'col5', 'col6', 'col7', 'col8', 'col9'])
  385.     dataset_pivot = dataframe.dorder(dataset_pivot, ['col0', 'col2', 'col1', 'col3'])
  386.     new_dataset = dataframe.melt(dataset_pivot)
  387.     new_dataset_pivot = dataframe.dcast(new_dataset, row=0, col=1, value=2)
  388.     assert(dataset_pivot == new_dataset_pivot)
  389.     new_dataset_pivot = dataframe.dremove_rows_if_col(new_dataset_pivot, 'col1', lambda x: x < 6.0)
  390.     new_dataset_pivot = dataframe.dremove_cols_if_row(new_dataset_pivot, 'row7', lambda x: x > 8.0)
  391.     new_dataset = dataframe.melt(dataset_pivot)
  392.     for r, c, v in new_dataset:
  393.         print '{} - {} - {}'.format(r, c, v)
RAW Paste Data