Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import copy
import fnmatch
import logging
from collections import OrderedDict
from functools import reduce
class dataframe(OrderedDict):
    '''
    Minimal pivot table ("wide" format data frame) built on OrderedDict.

    Keys are row names; each value is a tuple of cell values, one per
    column name held in the ``_names`` tuple (see cols()).
    Runs on both Python 2 and Python 3 (items()/range()/functools.reduce).
    '''

    def __init__(self, l=()):
        '''
        param l: optional iterable of (row_name, values_tuple) pairs
        '''
        # () instead of the original mutable default [] (shared-state pitfall);
        # l was never mutated, so this is behavior-compatible.
        super(dataframe, self).__init__(l)
        self._names = tuple()

    def copy(self):
        '''
        shallow copy of the mapping that also carries the column names
        '''
        df = super(dataframe, self).copy()
        df._names = copy.deepcopy(self.cols())
        return df

    def cols(self):
        '''
        column names as a tuple
        '''
        return self._names

    def rows(self):
        '''
        row names (mapping keys) as a tuple
        '''
        return tuple(self.keys())

    def __str__(self):
        '''
        tab-separated table: header line with the column names, then one
        line per row prefixed by its row name
        '''
        buff = '\t'
        buff += '\t'.join(self.cols())
        buff += '\n'
        for k, values in self.items():
            buff += '{}\t'.format(k)
            for v in values:
                buff += '{}\t'.format(v)
            buff += '\n'
        return buff

    @staticmethod
    def dcast(dataset, row, col, value, functor=lambda x, y: x + y, neutral=0.0):
        '''
        casting to wide format (similar to dcast in reshape2@R)

        param dataset: list of tuples (long format)
        param row, col, value: tuple indexes of row name, column name and value
        param functor: binary aggregation applied via reduce() (default: sum)
        param neutral: cell value when no tuple matches a (row, col) pair
        return: dataframe with one key per distinct row name, columns in
                first-seen order
        '''
        col_uniques = []
        row_uniques = []
        # (row_name, col_name) -> list of float values in input order;
        # single grouping pass instead of the original O(rows*cols*n) rescan
        groups = {}
        for tupl in dataset:
            r_name, c_name = tupl[row], tupl[col]
            if c_name not in col_uniques:
                col_uniques.append(c_name)
            if r_name not in row_uniques:
                row_uniques.append(r_name)
            groups.setdefault((r_name, c_name), []).append(float(tupl[value]))
        result = dataframe()
        result._names = tuple(col_uniques)
        for r_name in row_uniques:
            row_list = []
            for c_name in col_uniques:
                total = groups.get((r_name, c_name), [])
                row_list.append(reduce(functor, total) if total else neutral)
            result[r_name] = tuple(row_list)
        return result

    def melt(self, permutation=False):
        '''
        undo pivot table
        return a list of tuples (row, col, value) or (col, row, value)
        (with permutation=True)
        '''
        dataset = []
        for k, values in self.items():
            for c, v in enumerate(values):
                if not permutation:
                    dataset.append((k, self.cols()[c], v))
                else:
                    dataset.append((self.cols()[c], k, v))
        return dataset

    @staticmethod
    def dindex_col(df, col_name):
        '''
        get index from column name (raises Exception when missing)
        '''
        try:
            return df.cols().index(col_name)
        except ValueError:
            raise Exception('col name: {} is not found'.format(col_name))

    @staticmethod
    def dindex_row(df, row_name):
        '''
        get index from row name (raises Exception when missing)
        '''
        try:
            return df.rows().index(row_name)
        except ValueError:
            raise Exception('row name: {} is not found'.format(row_name))

    @staticmethod
    def dname_col(df, col_index):
        '''
        get name from column index (raises Exception when out of range)
        '''
        if 0 <= col_index < len(df.cols()):
            return df.cols()[col_index]
        raise Exception('not found name for col index: {}'.format(col_index))

    @staticmethod
    def dname_row(df, row_index):
        '''
        get name from row index (raises Exception when out of range)
        '''
        if 0 <= row_index < len(df.rows()):
            return df.rows()[row_index]
        raise Exception('not found name for row index: {}'.format(row_index))

    @staticmethod
    def dswap_col(df, a, b):
        '''
        swap columns a and b (indexes); mutates and returns df
        '''
        assert a < len(df.cols())
        assert b < len(df.cols())
        nc = list(df.cols())
        nc[a], nc[b] = nc[b], nc[a]
        df._names = tuple(nc)
        # reassigning existing keys does not change the dict size,
        # so iterating items() while updating is safe
        for k, v in df.items():
            cells = list(v)
            cells[a], cells[b] = cells[b], cells[a]
            df[k] = tuple(cells)
        return df

    @staticmethod
    def dswap_row(df, a, b):
        '''
        swap rows a and b (indexes); returns a NEW dataframe (df untouched)
        '''
        newdf = dataframe()
        newdf._names = copy.deepcopy(df.cols())
        for i, k in enumerate(df):
            if i == a:
                k2 = dataframe.dname_row(df, b)
            elif i == b:
                k2 = dataframe.dname_row(df, a)
            else:
                k2 = k
            newdf[k2] = df[k2]
        return newdf

    @staticmethod
    def _partition(df, array, sort, reverse, begin, end):
        '''
        Lomuto partition of array[begin..end]; every swap performed on the
        sort-key array is mirrored on the dataframe (columns for
        sort='row', rows for sort='col'). Returns (df, pivot_index).
        '''
        assert df is not None
        pivot = begin
        for i in range(begin + 1, end + 1):
            # keeps elements <= pivot left when ascending, > pivot when reverse
            if bool(array[i] <= array[begin] and not reverse) ^ bool(array[i] > array[begin] and reverse):
                pivot += 1
                array[i], array[pivot] = array[pivot], array[i]
                if sort == 'row':
                    df = dataframe.dswap_col(df, i, pivot)
                elif sort == 'col':
                    df = dataframe.dswap_row(df, i, pivot)
                else:
                    raise Exception('invalid sort mode: {}'.format(sort))
        array[pivot], array[begin] = array[begin], array[pivot]
        if sort == 'row':
            df = dataframe.dswap_col(df, begin, pivot)
        elif sort == 'col':
            df = dataframe.dswap_row(df, begin, pivot)
        else:
            raise Exception('invalid sort mode: {}'.format(sort))
        return df, pivot

    @staticmethod
    def _quicksort(df, array, sort='row', reverse=False, begin=0, end=None):
        '''
        recursive quicksort of array mirroring every swap on the dataframe;
        returns the (possibly replaced) dataframe
        '''
        assert df is not None
        if end is None:
            end = len(array) - 1
        if begin >= end:
            return df
        df, pivot = dataframe._partition(df, array, sort, reverse, begin, end)
        df = dataframe._quicksort(df, array, sort, reverse, begin, pivot - 1)
        df = dataframe._quicksort(df, array, sort, reverse, pivot + 1, end)
        return df

    @staticmethod
    def get_row(df, row_name):
        '''
        return row in tuple
        '''
        return df[row_name]

    @staticmethod
    def get_col(df, col_name):
        '''
        return col in list
        '''
        col_index = dataframe.dindex_col(df, col_name)
        return [v[col_index] for v in df.values()]

    @staticmethod
    def dsort_row(df, row_name, reverse=False):
        '''
        sort columns by the values of row row_name (uses dswap_col)
        '''
        return dataframe._quicksort(df, list(dataframe.get_row(df, row_name)), sort='row', reverse=reverse)

    @staticmethod
    def dsort_col(df, col_name, reverse=False):
        '''
        sort rows by the values of column col_name (uses dswap_row)
        '''
        return dataframe._quicksort(df, list(dataframe.get_col(df, col_name)), sort='col', reverse=reverse)

    @staticmethod
    def dsort_row_index(df, row, reverse=False):
        '''
        like dsort_row but the row is given by index
        '''
        return dataframe.dsort_row(df, dataframe.dname_row(df, row), reverse=reverse)

    @staticmethod
    def dsort_col_index(df, col, reverse=False):
        '''
        like dsort_col but the column is given by index
        '''
        return dataframe.dsort_col(df, dataframe.dname_col(df, col), reverse=reverse)

    @staticmethod
    def dorder(df, new_order, reverse=False):
        '''
        reorder columns to match new_order (list of column names);
        unknown names are logged and keep sort key 0
        '''
        indexes = [0 for _ in range(len(df.cols()))]
        for i, col_to_move in enumerate(new_order):
            try:
                indexes[dataframe.dindex_col(df, col_to_move)] = i
            except Exception:
                logging.warning('no found column: {}'.format(col_to_move))
        return dataframe._quicksort(df, indexes, sort='row', reverse=reverse)

    @staticmethod
    def dremove_cols(df, cols_remove):
        '''
        cols_remove is a list with names to remove cols; mutates and returns df
        '''
        indexes = [dataframe.dindex_col(df, col_name) for col_name in cols_remove]
        # delete from the highest index first so earlier deletions
        # do not shift the later ones
        indexes.sort(reverse=True)
        names = list(df.cols())
        for i in indexes:
            del names[i]
        df._names = tuple(names)
        for k, v in df.items():
            cells = list(v)
            for i in indexes:
                del cells[i]
            df[k] = tuple(cells)
        return df

    @staticmethod
    def dremove_col(df, pattern):
        '''
        remove columns whose name matches the glob pattern
        '''
        cols_remove = [c for c in df.cols() if fnmatch.fnmatch(c, pattern)]
        if cols_remove:
            return dataframe.dremove_cols(df, cols_remove)
        return df

    @staticmethod
    def dremove_rows(df, rows_remove):
        '''
        rows_remove is a list with key names to remove rows;
        mutates and returns df
        '''
        indexes = set()
        for row_name in rows_remove:
            indexes.add(dataframe.dindex_row(df, row_name))
        # iterate a snapshot: deleting keys while iterating the live
        # mapping raises RuntimeError on Python 3
        for idx, k in reversed(list(enumerate(df))):
            if idx in indexes:
                logging.debug('removing row {}'.format(k))
                del df[k]
        return df

    @staticmethod
    def dremove_row(df, pattern):
        '''
        remove rows whose key matches the glob pattern
        '''
        rows_remove = [r for r in df.rows() if fnmatch.fnmatch(r, pattern)]
        if rows_remove:
            return dataframe.dremove_rows(df, rows_remove)
        return df

    @staticmethod
    def dremove_cols_if_row(df, row_name, functor):
        '''
        param df: dataframe
        param row_name: name of the row to check
        param functor: predicate applied to each cell of that row; columns
                       where it is true are removed
        '''
        i = 0
        try:
            # the for iterates a snapshot of the original row tuple;
            # i tracks the LIVE column index and only advances when the
            # current column is kept (removal shifts later columns left)
            for v in df[row_name]:
                if functor(v):
                    df = dataframe.dremove_cols(df, [dataframe.dname_col(df, i)])
                else:
                    i += 1
        except KeyError:
            logging.warning('no found row: {}'.format(row_name))
        return df

    @staticmethod
    def dremove_rows_if_col(df, col_name, functor):
        '''
        param df: dataframe
        param col_name: name of the column to check
        param functor: predicate applied to each cell of that column; rows
                       where it is true are removed
        '''
        col_index = dataframe.dindex_col(df, col_name)  # hoisted loop invariant
        # snapshot items(): dremove_rows mutates df while we iterate
        for k, v in list(df.items()):
            if functor(v[col_index]):
                df = dataframe.dremove_rows(df, [k])
        return df

    @staticmethod
    def writef(f, text):
        '''
        write one line, converting single to double quotes so that Python
        list reprs become JSON-style strings
        '''
        text = text.replace("'", '"')
        f.write('{}\n'.format(text))

    @staticmethod
    def to_json(df, filename, x='name'):
        '''
        dump as a JSON-ish array-of-arrays: a header row (x + column
        names) then one row per key ([row_name, values...])
        param x: header label for the row-name column
        '''
        with open(filename, 'wt') as f:
            dataframe.writef(f, '[')
            dataframe.writef(f, '\t{},'.format([x] + list(df.cols())))
            last = len(df) - 1
            for i, (k, v) in enumerate(df.items()):
                sep = ',' if i != last else ''  # no trailing comma on the last row
                dataframe.writef(f, '\t{}{}'.format([k] + list(v), sep))
            dataframe.writef(f, ']')
if __name__ == '__main__':
    # Demo: build a 10x10 long-format dataset, pivot it, and exercise the
    # sort/remove/reorder API; round-trips via melt/dcast must agree.
    dataset = []
    for r in range(10):
        for c in range(10):
            dataset.append(('row{}'.format(r), 'col{}'.format(c), r + c))
    dataset_pivot = dataframe.dcast(dataset, row=0, col=1, value=2, functor=lambda x, y: x + y)
    dataset_pivot = dataframe.dsort_col(dataset_pivot, 'col9', reverse=True)
    dataset_pivot = dataframe.dremove_cols(dataset_pivot, ['col4', 'col5', 'col6', 'col7', 'col8', 'col9'])
    dataset_pivot = dataframe.dorder(dataset_pivot, ['col0', 'col2', 'col1', 'col3'])
    new_dataset = dataframe.melt(dataset_pivot)
    new_dataset_pivot = dataframe.dcast(new_dataset, row=0, col=1, value=2)
    assert dataset_pivot == new_dataset_pivot
    new_dataset_pivot = dataframe.dremove_rows_if_col(new_dataset_pivot, 'col1', lambda x: x < 6.0)
    new_dataset_pivot = dataframe.dremove_cols_if_row(new_dataset_pivot, 'row7', lambda x: x > 8.0)
    new_dataset = dataframe.melt(dataset_pivot)
    for r, c, v in new_dataset:
        # print() with a single argument behaves the same on Python 2 and 3
        # (original used the Python-2-only print statement)
        print('{} - {} - {}'.format(r, c, v))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement