Advertisement
Guest User

tabulardata.py

a guest
Jan 18th, 2013
18
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 20.00 KB | None | 0 0
  1. # Encoding: UTF-8
  2. """
  3. Module for working with tabular data. Tabular data in this context means data in
  4. a column and row format.
  5. """
  6. import sys
  7. import os
  8. import math
  9.  
  10. import texttable
  11.  
  12. import broadpy.clitools as clitools
  13. from broadpy.clitools import console_width
  14.  
  15. # Mapping between supported file formats and their respective load/save
  16. # functions.
  17. LOAD_EXT_MAPPING = {
  18.     'xlsx': 'load_xlsx_file'
  19. }
  20. SAVE_EXT_MAPPING = {
  21.     'xlsx': 'save_xlsx_file'
  22. }
  23.  
  24. # Some error constants
  25. ERROR_UNSUPPORTED_FILE_EXTENSION = 0x0001
  26. ERROR_DUPLICATE_COLNAME = 0x0002
  27.  
  28. class Table(object):
  29.     """A class for representing a table."""
  30.  
  31.     def __init__(self):
  32.         # This variable is used to remember which row of the file is the first
  33.         # row imported. This is to maintain the correct row indexes when
  34.         # printing the table.
  35.         self.from_row = 0
  36.  
  37.         # Lists for the columns and rows.
  38.         self.columns = []
  39.         self.rows = []
  40.  
  41.     def _stringify_value(self, value):
  42.         if type(value) in [str, unicode, int, float, long]:
  43.             return unicode(value)
  44.         elif value in [None, True, False]:
  45.             return u''
  46.         else:
  47.             raise TypeError, 'Unknown type: {}'.format(type(value))
  48.  
  49.     def get(self, index):
  50.         """Get the row at `index` as an object with each column as a key"""
  51.         r = {}
  52.  
  53.         # Iterate through each cell of the row at `index` and assign it to the
  54.         # object at the column of the index of the cell.
  55.         for i, cell in enumerate(self.rows[index]):
  56.             r[self.columns[i]] = cell
  57.  
  58.         return r
  59.  
  60.     def get_cell_value(self, row, col):
  61.         return self.rows[row][col]
  62.  
  63.     def set_cell_value(self, row, col, value):
  64.         self.rows[row][col] = value
  65.  
  66.     def get_row_count(self):
  67.         return len(self.rows)
  68.  
  69.     def get_column_count(self):
  70.         return len(self.columns)
  71.  
  72.     def get_column_header(self, index):
  73.         return self.columns[index]
  74.  
  75.     def get_column_index(self, label):
  76.         """Get the index of a column by the header."""
  77.         return self.columns.index(label)
  78.  
  79.     def get_column_headers(self):
  80.         """Returns a list containing all the column headers in order."""
  81.         return self.columns
  82.  
  83.     def find(self, value, case_sensitive=False, offset=(0, 0), wrap=False):
  84.         """Find a table cell by the input `value`.
  85.  
  86.        Args:
  87.            value (str): The value to search for. For simplicity, `value` must
  88.                always be a string. A numerical string will match integers in
  89.                the table and empty strings will match `None`.
  90.            offset: (optional) A sequence of two integers deciding where the
  91.                search will start. The first number is the row, the second is
  92.                the column. The search will start at the offset, so the cell at
  93.                the offset could be matched.
  94.            wrap (bool): (optional) Whether or not to continue from the start of
  95.                the table when the end is reached without results.
  96.        Returns:
  97.            A `TableCell` instance or `None` if no results could be found.
  98.        """
  99.  
  100.         def search_cell(row_index, column_index, x):
  101.             # Get the cell x and ensure that it's a string.
  102.             cell_value = self.get_cell_value(row_index, column_index)
  103.             cell_value = self._stringify_value(cell_value)
  104.  
  105.             if not case_sensitive:
  106.                 cell_value = cell_value.lower()
  107.  
  108.             # Check if the search term is in the cell x.
  109.             if x in cell_value:
  110.                 return TableCell(self, row_index, column_index)
  111.  
  112.             return None
  113.  
  114.         if not case_sensitive:
  115.             value = value.lower()
  116.  
  117.         # Perform search.
  118.         for row_index in xrange(offset[0], self.row_count):
  119.             for column_index in xrange(self.column_count):
  120.  
  121.                 # Skip the cells that are before the offset.
  122.                 if row_index == offset[0] and column_index < offset[1]:
  123.                     continue
  124.  
  125.                 result = search_cell(row_index, column_index, value)
  126.                
  127.                 if result is not None:
  128.                     return result
  129.  
  130.         # Wrap search?
  131.         if wrap and (offset[0] > 0 or offset[1] > 0):
  132.             for row_index in xrange(offset[0] + 1):
  133.                 for column_index in xrange(self.column_count):
  134.  
  135.                     # Skip the cells that are after the offset.
  136.                     if row_index == offset[0] and column_index >= offset[1]:
  137.                         continue
  138.  
  139.                     result = search_cell(row_index, column_index, value)
  140.                    
  141.                     if result is not None:
  142.                         return result
  143.  
  144.         # No results? Daw :(
  145.         return None
  146.  
  147.     def each(self, fn, progress=None):
  148.         """
  149.        Run `fn` for each of the rows in the table.
  150.  
  151.        Args:
  152.            fn (function): A function that will be executed for each row in the
  153.                table with the respective item index and item as function
  154.                arguments.
  155.            progress (str): (optional) template for a status that will be
  156.                displayed in the console and updated for every iteration.
  157.        Example:
  158.            >>> def myfunction(index, item):
  159.            ...     print index, item[name]
  160.            >>>
  161.            >>> tbl.each(myfunction)
  162.        """
  163.  
  164.         # If a progress indicator is to be displayed, create the Progress class.
  165.         if progress is not None:
  166.             p = clitools.Progress(limit=len(self.rows), template=progress)
  167.         else:
  168.             p = None
  169.  
  170.         for i in range(0, len(self.rows)):
  171.             # Run the input function.
  172.             fn(i, self.get(i))
  173.  
  174.             # Update the progress indicator.
  175.             if p is not None:
  176.                 p.update()
  177.  
  178.         # We're done using the progress indicator.
  179.         if p: p.done()
  180.  
  181.     def remove_row(self, index):
  182.         del self.rows[index]
  183.  
  184.     def add_column(self, index, colname, fn=None, progress=None):
  185.         """
  186.        Add a column to the table. Equivalent of using `add_columns` with one
  187.        column name.
  188.        """
  189.  
  190.         # Wrap the fn function to make it return an array.
  191.         if fn:
  192.             fn_wrapper = lambda i, row: [fn(i, row)]
  193.  
  194.         self.add_columns(index, [colname], fn and fn_wrapper or fn, progress)
  195.  
  196.     def add_columns(self, index, colnames, fn=None, progress=None):
  197.         """
  198.        Add columns to the table.
  199.  
  200.        Args:
  201.            index (int): The index the columns will be inserted at.
  202.            colnames (iterable): An iterable of strings containing the column
  203.                names. This argument also specifies the amount of columns to
  204.                insert.
  205.            fn (callable): (optional) A callable that will be called for each
  206.                existing row with the arguments (index, item). The callable's
  207.                return value should be an iterable at least the length of
  208.                `colnames` which will be used to fill the new columns for the
  209.                current row. If this argument is not present the value of the
  210.                new columns will be set to `None` for all rows.
  211.        """
  212.  
  213.         if not fn:
  214.             def fn(*args): return None
  215.  
  216.         # If the user wants the progress displayed, initiate a Progress class.
  217.         if progress:
  218.             progress = clitools.Progress(limit=len(self.rows),
  219.                                          template=progress)
  220.  
  221.         # Duplicate column names not allowed.
  222.         for colname in colnames:
  223.             if colname in self.columns:
  224.                 raise TabularDataError('Column "%s" already exists.' % colname,
  225.                                        ERROR_DUPLICATE_COLNAME)
  226.  
  227.         # Insert the new columns.
  228.         for i, colname in enumerate(colnames):
  229.             self.columns.insert(index+i, colname)
  230.  
  231.         # Iterate over the rows and run `fn` for each one.
  232.         rownum = len(self.rows)
  233.         for i in xrange(0, rownum):
  234.  
  235.             # If a progress template was provided, update the progress for
  236.             # every iteration.
  237.             if progress:
  238.                 progress.update()
  239.  
  240.             val = fn(i, self.get(i))
  241.  
  242.             for num in xrange(0, len(colnames)):
  243.                 self.rows[i].insert(index+num, val and val[num] or None)
  244.  
  245.         # If a progress template was used, print a newline now.
  246.         if progress:
  247.             sys.stdout.write(u'\n')
  248.  
  249.     def append_column(self, colname, fn=None, progress=None):
  250.         """
  251.        Adds a column to the far right of the table. Equivalent of calling
  252.        `add_column` with the index being `len(columns)`.
  253.        """
  254.         self.add_column(len(self.columns), colname, fn, progress)
  255.  
  256.     def append_columns(self, colnames, fn=None, progress=None):
  257.         """
  258.        Adds columns to the far right of the table. Equivalent of calling
  259.        `add_columns` with the index being `len(columns)`.
  260.        """
  261.         self.add_columns(len(self.columns), colnames, fn, progress)
  262.  
  263.     def remove_column(self, index):
  264.         """Removes a column. See `Table.remove_columns`."""
  265.         self.remove_columns([index])
  266.  
  267.     def remove_columns(self, indexes):
  268.         """
  269.        Remove the table columns at the input indexes.
  270.  
  271.        Args:
  272.            indexes: An iterable yielding integers.
  273.        """
  274.         for index in reversed(sorted(indexes)):
  275.             del self.columns[index]
  276.  
  277.             for row in self.rows:
  278.                 del row[index]
  279.  
  280.     def insert_row(self, index, values=None):
  281.         """Insert a row to the table.
  282.  
  283.        Args:
  284.            index (int): The index in which to insert the row.
  285.            values: (optional) The values of the row. This must either be `None`
  286.                or a sequence type with the exact length as the number of
  287.                columns.
  288.        """
  289.         if values is None:
  290.             values = [None for x in xrange(0, self.column_count)]
  291.         else:
  292.             values = list(values)
  293.  
  294.         assert len(values) == self.column_count, 'Invalid item count in row'
  295.  
  296.         self.rows.insert(index, values)
  297.  
  298.     def append_row(self, values=None):
  299.         """Adds a row to the bottom of the table.
  300.  
  301.        Args:
  302.            values: (optional) See `Table.insert_row`.
  303.        """
  304.         self.insert_row(self.row_count, values)
  305.  
  306.     def to_text(self, cols=None, max_width='auto', from_row=None, to_row=None):
  307.         """Return a text representation of the table"""
  308.  
  309.         if not cols:
  310.             cols = self.columns
  311.  
  312.         # Default the max width of the table to the width of the console.
  313.         if max_width == 'auto':
  314.             max_width = console_width() - 2
  315.  
  316.         # Prepare the column headers to draw. Is cols is set, draw only the
  317.         # columns in cols.
  318.         columns = ['#'] + [x for x in self.columns if x in cols]
  319.  
  320.         # Prepare the rows to draw. If cols is set, filter which cells are
  321.         # displayed by those columns.
  322.         rows = []
  323.  
  324.         for rowindex, row in enumerate(self.rows):
  325.             # Honor the from_row and to_row arguments.
  326.             if from_row is not None and rowindex < from_row:
  327.                 continue
  328.             if to_row is not None and rowindex > to_row:
  329.                 continue
  330.  
  331.             r = [self.from_row + rowindex]
  332.             for i, cell in enumerate(row):
  333.                 # Filter which columns are displayed for every row.
  334.                 if self.columns[i] in cols:
  335.                     r.append(cell is not None and cell or u'')
  336.             rows.append(r)
  337.  
  338.         table = texttable.Texttable(max_width=max_width)
  339.         table.set_cols_dtype(['t'] * len(columns))
  340.         table.add_rows([columns] + rows)
  341.  
  342.         return table.draw()
  343.  
  344.     def save(self, path):
  345.         """
  346.        Save this `Table` instance to a file. The file format depends on the
  347.        file extendion of `path`.
  348.  
  349.        Args:
  350.            path (str): The `Table` instance will be saved to this path. The
  351.                file format will be chosen automatically based on the file
  352.                extension of `path`.
  353.        """
  354.  
  355.         # Extract the file extension.
  356.         ext = os.path.splitext(path)[1][1:]
  357.  
  358.         if ext not in SAVE_EXT_MAPPING:
  359.             raise TabularDataError(
  360.                 'Unknown file extension: {}'.format(repr(ext)),
  361.                 ERROR_UNSUPPORTED_FILE_EXTENSION
  362.             )
  363.  
  364.         # Fetch the save function for this file extension.
  365.         fn = get_from_self(SAVE_EXT_MAPPING[ext])
  366.  
  367.         fn(self, path)
  368.  
  369.     # Properties.
  370.     row_count = property(get_row_count)
  371.     column_count = property(get_column_count)
  372.     column_headers = property(get_column_headers)
  373.  
  374. class TableCell(object):
  375.     """A table cell for use with the `Table`."""
  376.  
  377.     def __init__(self, table, row_index, column_index):
  378.         self.table = table
  379.         self.row_index = row_index
  380.         self.column_index = column_index
  381.  
  382.     def __unicode__(self):
  383.         return unicode(str(self))
  384.  
  385.     def __str__(self):
  386.         return self.value
  387.  
  388.     def __repr__(self):
  389.         return '<TableCell(row={}, col={}, value={})>'.format(
  390.             self.row_index, self.column_index, repr(self.value)
  391.         )
  392.  
  393.     def get_value(self):
  394.         return self.table.get_cell_value(self.row_index, self.column_index)
  395.  
  396.     def set_value(self, value):
  397.         self.table.set_cell_value(self.row_index, self.column_index)
  398.  
  399.     # Properties
  400.  
  401.     value = property(get_value, set_value)
  402.     position = property(lambda self: (self.row_index, self.column_index))
  403.  
  404. class TabularDataError(Exception):
  405.     pass
  406.  
  407. def from_file(path, from_row=None, to_row=None, progress=None):
  408.     """
  409.    Method for loading a tabular data file.
  410.  
  411.    Args:
  412.        path (str): The path to the file.
  413.    Returns:
  414.        A `Table` instance representing the file.
  415.    """
  416.  
  417.     # Extract the file extension.
  418.     ext = os.path.splitext(path)[1][1:]
  419.  
  420.     # Do we know this file extension?
  421.     if ext not in LOAD_EXT_MAPPING:
  422.         raise TabularDataError(
  423.             'Unknown file extension: {}'.format(repr(ext)),
  424.             ERROR_UNSUPPORTED_FILE_EXTENSION
  425.         )
  426.  
  427.     # Fetch a reference to the function responsible for loading the provided
  428.     # file format.
  429.     fn = get_from_self(LOAD_EXT_MAPPING[ext])
  430.  
  431.     return fn(path, from_row, to_row, progress)
  432.  
  433. def load_xlsx_file(path, from_row=None, to_row=None, progress=None):
  434.     """
  435.    Function for loading an Excel file (xlsx) using the openpyxl library.
  436.  
  437.    Note:
  438.        If the file contains multiple sheets only the first one will be used.
  439.  
  440.    Args:
  441.        path (str): The path of the Excel file.
  442.    Returns:
  443.        A `Table` representation of the file.
  444.    """
  445.     from openpyxl import load_workbook
  446.  
  447.     # Wrap progress so we don't have to check it for every usage.
  448.     if progress is None:
  449.         progress = lambda *args: None
  450.  
  451.     # Ready a Table instance.
  452.     table = Table()
  453.     table.from_row = from_row is None and 0 or from_row
  454.  
  455.     # Load the Excel file.
  456.     wb = load_workbook(path, use_iterators=True)
  457.     sheet = wb.worksheets[0]
  458.  
  459.     rownum = sheet.get_highest_row() - 1
  460.     # colnum = letter_to_index(sheet.get_highest_column()) + 1
  461.  
  462.     # Get the column widths.
  463.     # column_widths = get_xlsx_column_widths(worksheet=sheet)
  464.  
  465.     def _process_cell(x):
  466.         """
  467.        This function decides which type to interpret cell value `x` as.
  468.  
  469.        * If `x` is a number, it's returned unchanged UNLESS it's a round
  470.          float, in which case it is converted to an int.
  471.  
  472.        * If `x` is a string type it is stripped of enclosing whitespace and if
  473.          it is 100% whitespace, `None` is returned. Otherwise a conversion to
  474.          a number is attempted.
  475.  
  476.        * If `x` is a `str` it is converted to unicode.
  477.        """
  478.  
  479.         if type(x) in [int, long, float]:
  480.             # If this is a float with a round number, convert it to an int.
  481.             if x % 1 == 0:
  482.                 return int(x)
  483.  
  484.             return x
  485.  
  486.         if type(x) in [str, unicode]:
  487.             x = x.strip()
  488.  
  489.             if len(x) < 1:
  490.                 return None
  491.  
  492.             try:
  493.                 return int(x)
  494.             except ValueError:
  495.                 pass
  496.  
  497.             try:
  498.                 return float(x.replace(',', '.'))
  499.             except ValueError:
  500.                 pass
  501.  
  502.         if type(x) is str:
  503.             return unicode(str)
  504.  
  505.         return x
  506.  
  507.     # Iterate over each row. The first row is the table columns, so treat the
  508.     # remaining rows as if their index was i-1.
  509.     for rowindex, row in enumerate(sheet.iter_rows()):
  510.  
  511.         if rowindex > 0:
  512.             # If from_row is set and this row is below that index, skip.
  513.             if from_row is not None and rowindex - 1 < from_row:
  514.                 continue
  515.             # If to_row is set and this row is above that index, stop.
  516.             if to_row is not None and rowindex - 1 > to_row:
  517.                 break
  518.  
  519.             table.rows.append([])
  520.  
  521.         # Iterate over each cell in the row.
  522.         for cell in row:
  523.             if rowindex == 0:
  524.                 table.columns.append(_process_cell(cell.internal_value))
  525.             else:
  526.                 table.rows[-1].append(_process_cell(cell.internal_value))
  527.  
  528.         progress(rowindex, rownum)
  529.  
  530.     # table.column_widths = column_widths[0:len(table.columns)]
  531.  
  532.     # Return the table.
  533.     return table
  534.  
  535. def get_xlsx_column_widths(filepath=None, worksheet=None):
  536.     """Fetches the column widths from the Excel 2007 (xlsx) file using the
  537.    openpyxl library.
  538.  
  539.    Must supply ONE of the arguments, not both.
  540.  
  541.    Args:
  542.        filepath (str): The path to the Excel file.
  543.        worksheet: The openpyxl Worksheet.
  544.    Returns:
  545.        Returns a list of floats.
  546.    """
  547.     if filepath is not None:
  548.         from openpyxl import load_workbook
  549.  
  550.         wb = load_workbook(filepath)
  551.         sheet = wb.worksheets[0]
  552.     elif worksheet is not None:
  553.         sheet = worksheet
  554.     else:
  555.         raise ValueError('Must supply either filepath or worksheet.')
  556.  
  557.     widths = sheet.column_dimensions.iteritems()
  558.     widths = sorted(widths, key=lambda x: x[0])
  559.  
  560.     output = []
  561.  
  562.     for key, value in widths:
  563.         output.append(value.width)
  564.  
  565.     return output
  566.  
  567. def save_xlsx_file(table, path):
  568.     """
  569.    Function for saving a `Table` instance to an Excel 2007 (xlsx) file using
  570.    the openpyxl library.
  571.  
  572.    Args:
  573.        table (tabulardata.Table): The table to save.
  574.        path (str): The path the table will be saved to.
  575.    """
  576.     from openpyxl import Workbook
  577.  
  578.     # Create an Excel workbook for the output.
  579.     wb = Workbook()
  580.  
  581.     # Create a new sheet in the workbook for the table.
  582.     ws = wb.worksheets[0]
  583.  
  584.     # Write the columns to the sheet.
  585.     ws.append(table.columns)
  586.  
  587.     # Iterate over the table rows and append them to the worksheet.
  588.     for row in table.rows:
  589.         ws.append([item or '' for item in row])
  590.  
  591.     wb.save(path)
  592.  
  593. def letter_to_index(letter):
  594.     """Converts a column letter, e.g. "A", "B", "AA", "BC" etc. to a zero based
  595.    column index.
  596.  
  597.    A becomes 0, B becomes 1, Z becomes 25, AA becomes 26 etc.
  598.  
  599.    Args:
  600.        letter (str): The column index letter.
  601.    Returns:
  602.        The column index as an integer.
  603.    """
  604.     letter = letter.upper()
  605.     result = 0
  606.  
  607.     for index, char in enumerate(reversed(letter)):
  608.         # Get the ASCII number of the letter and subtract 64 so that A
  609.         # corresponds to 1.
  610.         num = ord(char) - 64
  611.  
  612.         # Multiply the number with 26 to the power of `index` to get the correct
  613.         # value of the letter based on it's index in the string.
  614.         final_num = (26 ** index) * num
  615.  
  616.         result += final_num
  617.  
  618.     # Subtract 1 from the result to make it zero-based before returning.
  619.     return result - 1
  620.  
  621. def get_from_self(name):
  622.     """Shortcut function for fetching an item from this module"""
  623.     self = sys.modules[__name__]
  624.     return getattr(self, name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement