tallydaorangez

Table.py

Sep 22nd, 2019 (edited)
544
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 22.97 KB | None | 0 0
  1. from __future__ import annotations
  2. from tabulate import tabulate
  3. from btree import Btree
  4. import pickle
  5. import os
  6. import math
  7. from misc import get_op, split_condition
  8.  
  9.  
  10. class Table:
  11.     '''
  12.    Table object represents a table inside a database
  13.  
  14.    A Table object can be created either by assigning:
  15.        - a table name (string)
  16.        - column names (list of strings)
  17.        - column types (list of functions like str/int etc)
  18.        - primary (name of the primary key column)
  19.  
  20.    OR
  21.  
  22.        - by assigning a value to the variable called load. This value can be:
  23.            - a path to a Table file saved using the save function
  24.            - a dictionary that includes the appropriate info (all the attributes in __init__)
  25.  
  26.    '''
  27.     def __init__(self, name=None, column_names=None, column_types=None, primary_key=None, load=None):
  28.  
  29.         if load is not None:
  30.             # if load is a dict, replace the object dict with it (replaces the object with the specified one)
  31.             if isinstance(load, dict):
  32.                 self.__dict__.update(load)
  33.                 # self._update()
  34.             # if load is str, load from a file
  35.             elif isinstance(load, str):
  36.                 self._load_from_file(load)
  37.  
  38.         # if name, columns_names and column types are not none
  39.         elif (name is not None) and (column_names is not None) and (column_types is not None):
  40.  
  41.             self._name = name
  42.  
  43.             if len(column_names)!=len(column_types):
  44.                 raise ValueError('Need same number of column names and types.')
  45.  
  46.             self.column_names = column_names
  47.  
  48.             self.columns = []
  49.  
  50.             for col in self.column_names:
  51.                 if col not in self.__dir__():
  52.                     # this is used in order to be able to call a column using its name as an attribute.
  53.                     # example: instead of table.columns['column_name'], we do table.column_name
  54.                     setattr(self, col, [])
  55.                     self.columns.append([])
  56.                 else:
  57.                     raise Exception(f'"{col}" attribute already exists in "{self.__class__.__name__} "class.')
  58.  
  59.             self.column_types = [eval(ct) if not isinstance(ct, type) else ct for ct in column_types]
  60.             self.data = [] # data is a list of lists, a list of rows that is.
  61.  
  62.             # if primary key is set, keep its index as an attribute
  63.             if primary_key is not None:
  64.                 self.pk_idx = self.column_names.index(primary_key)
  65.             else:
  66.                 self.pk_idx = None
  67.  
  68.             self.pk = primary_key
  69.             # self._update()
  70.  
  71.     # if any of the name, columns_names and column types are none. return an empty table object
  72.  
  73.     def column_by_name(self, column_name):
  74.         return [row[self.column_names.index(column_name)] for row in self.data]
  75.  
  76.  
  77.     def _update(self):
  78.         '''
  79.        Update all the available columns with the appended rows.
  80.        '''
  81.         self.columns = [[row[i] for row in self.data] for i in range(len(self.column_names))]
  82.         for ind, col in enumerate(self.column_names):
  83.             setattr(self, col, self.columns[ind])
  84.  
  85.     def _cast_column(self, column_name, cast_type):
  86.         '''
  87.        Cast all values of a column using a specified type.
  88.  
  89.        Args:
  90.            column_name: string. The column that will be casted.
  91.            cast_type: type. Cast type (do not encapsulate in quotes).
  92.        '''
  93.         # get the column from its name
  94.         column_idx = self.column_names.index(column_name)
  95.         # for every column's value in each row, replace it with itself but casted as the specified type
  96.         for i in range(len(self.data)):
  97.             self.data[i][column_idx] = cast_type(self.data[i][column_idx])
  98.         # change the type of the column
  99.         self.column_types[column_idx] = cast_type
  100.         # self._update()
  101.  
  102.  
  103.     def _insert(self, row, insert_stack=[]):
  104.         '''
  105.        Insert row to table.
  106.  
  107.        Args:
  108.            row: list. A list of values to be inserted (will be casted to a predifined type automatically).
  109.            insert_stack: list. The insert stack (empty by default).
  110.        '''
  111.         if len(row)!=len(self.column_names):
  112.             raise ValueError(f'ERROR -> Cannot insert {len(row)} values. Only {len(self.column_names)} columns exist')
  113.  
  114.         for i in range(len(row)):
  115.             # for each value, cast and replace it in row.
  116.             # try:
  117.             row[i] = self.column_types[i](row[i])
  118.             # except:
  119.             #     raise ValueError(f'ERROR -> Value {row[i]} of type {type(row[i])} is not of type {self.column_types[i]}.')
  120.  
  121.             # if value is to be appended to the primary_key column, check that it doesnt alrady exist (no duplicate primary keys)
  122.             if i==self.pk_idx and row[i] in self.column_by_name(self.pk):
  123.                 raise ValueError(f'## ERROR -> Value {row[i]} already exists in primary key column.')
  124.  
  125.         # if insert_stack is not empty, append to its last index
  126.         if insert_stack != []:
  127.             self.data[insert_stack[-1]] = row
  128.         else: # else append to the end
  129.             self.data.append(row)
  130.         # self._update()
  131.  
  132.     def _update_rows(self, set_value, set_column, condition):
  133.         '''
  134.        Update where Condition is met.
  135.  
  136.        Args:
  137.            set_value: string. The provided set value.
  138.            set_column: string. The column to be altered.
  139.            condition: string. A condition using the following format:
  140.                'column[<,<=,=,>=,>]value' or
  141.                'value[<,<=,=,>=,>]column'.
  142.  
  143.                Operatores supported: (<,<=,=,>=,>)
  144.        '''
  145.         # parse the condition
  146.         column_name, operator, value = self._parse_condition(condition)
  147.  
  148.         # get the condition and the set column
  149.         column = self.column_by_name(column_name)
  150.         set_column_idx = self.column_names.index(set_column)
  151.  
  152.         # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names]
  153.  
  154.         # for each value in column, if condition, replace it with set_value
  155.         for row_ind, column_value in enumerate(column):
  156.             if get_op(operator, column_value, value):
  157.                 self.data[row_ind][set_column_idx] = set_value
  158.  
  159.         # self._update()
  160.                 # print(f"Updated {len(indexes_to_del)} rows")
  161.  
  162.  
  163.     def _delete_where(self, condition):
  164.         '''
  165.        Deletes rows where condition is met.
  166.  
  167.        Important: delete replaces the rows to be deleted with rows filled with Nones.
  168.        These rows are then appended to the insert_stack.
  169.  
  170.        Args:
  171.            condition: string. A condition using the following format:
  172.                'column[<,<=,==,>=,>]value' or
  173.                'value[<,<=,==,>=,>]column'.
  174.  
  175.                Operatores supported: (<,<=,==,>=,>)
  176.        '''
  177.         column_name, operator, value = self._parse_condition(condition)
  178.  
  179.         indexes_to_del = []
  180.  
  181.         column = self.column_by_name(column_name)
  182.         for index, row_value in enumerate(column):
  183.             if get_op(operator, row_value, value):
  184.                 indexes_to_del.append(index)
  185.  
  186.         # we pop from highest to lowest index in order to avoid removing the wrong item
  187.         # since we dont delete, we dont have to to pop in that order, but since delete is used
  188.         # to delete from meta tables too, we still implement it.
  189.  
  190.         for index in sorted(indexes_to_del, reverse=True):
  191.             if self._name[:4] != 'meta':
  192.                 # if the table is not a metatable, replace the row with a row of nones
  193.                 self.data[index] = [None for _ in range(len(self.column_names))]
  194.             else:
  195.                 self.data.pop(index)
  196.  
  197.         # self._update()
  198.         # we have to return the deleted indexes, since they will be appended to the insert_stack
  199.         return indexes_to_del
  200.  
  201.  
  202.     def _select_where(self, return_columns, condition=None, order_by=None, desc=True, top_k=None):
  203.         '''
  204.        Select and return a table containing specified columns and rows where condition is met.
  205.  
  206.        Args:
  207.            return_columns: list. The columns to be returned.
  208.            condition: string. A condition using the following format:
  209.                'column[<,<=,==,>=,>]value' or
  210.                'value[<,<=,==,>=,>]column'.
  211.  
  212.                Operatores supported: (<,<=,==,>=,>)
  213.            order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None).
  214.            desc: boolean. If True, order_by will return results in descending order (False by default).
  215.            top_k: int. An integer that defines the number of rows that will be returned (all rows if None).
  216.        '''
  217.  
  218.         # if * return all columns, else find the column indexes for the columns specified
  219.         if return_columns == '*':
  220.             return_cols = [i for i in range(len(self.column_names))]
  221.         else:
  222.             return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')]
  223.  
  224.         # if condition is None, return all rows
  225.         # if not, return the rows with values where condition is met for value
  226.         if condition is not None:
  227.             column_name, operator, value = self._parse_condition(condition)
  228.             column = self.column_by_name(column_name)
  229.             rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)]
  230.         else:
  231.             rows = [i for i in range(len(self.data))]
  232.  
  233.         # top k rows
  234.         # rows = rows[:int(top_k)] if isinstance(top_k,str) else rows
  235.         # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned)
  236.         dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()}
  237.  
  238.         # we need to set the new column names/types and no of columns, since we might
  239.         # only return some columns
  240.         dict['column_names'] = [self.column_names[i] for i in return_cols]
  241.         dict['column_types']   = [self.column_types[i] for i in return_cols]
  242.  
  243.         s_table = Table(load=dict)
  244.         if order_by:
  245.             s_table.order_by(order_by, desc)
  246.  
  247.         s_table.data = s_table.data[:int(top_k)] if isinstance(top_k,str) else s_table.data
  248.  
  249.         return s_table
  250.  
  251.  
  252.     def _select_where_with_btree(self, return_columns, bt, condition, order_by=None, desc=True, top_k=None):
  253.  
  254.         # if * return all columns, else find the column indexes for the columns specified
  255.         if return_columns == '*':
  256.             return_cols = [i for i in range(len(self.column_names))]
  257.         else:
  258.             return_cols = [self.column_names.index(colname) for colname in return_columns]
  259.  
  260.  
  261.         column_name, operator, value = self._parse_condition(condition)
  262.  
  263.         # if the column in condition is not a primary key, abort the select
  264.         if column_name != self.column_names[self.pk_idx]:
  265.             print('Column is not PK. Aborting')
  266.  
  267.         # here we run the same select twice, sequentially and using the btree.
  268.         # we then check the results match and compare performance (number of operation)
  269.         column = self.column_by_name(column_name)
  270.  
  271.         # sequential
  272.         rows1 = []
  273.         opsseq = 0
  274.         for ind, x in enumerate(column):
  275.             opsseq+=1
  276.             if get_op(operator, x, value):
  277.                 rows1.append(ind)
  278.  
  279.         # btree find
  280.         rows = bt.find(operator, value)
  281.  
  282.         # same as simple select from now on
  283.         rows = rows[:top_k]
  284.         # TODO: this needs to be dumbed down
  285.         dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()}
  286.  
  287.         dict['column_names'] = [self.column_names[i] for i in return_cols]
  288.         dict['column_types']   = [self.column_types[i] for i in return_cols]
  289.  
  290.         s_table = Table(load=dict)
  291.         if order_by:
  292.             s_table.order_by(order_by, desc)
  293.  
  294.         s_table.data = s_table.data[:int(top_k)] if isinstance(top_k,str) else s_table.data
  295.  
  296.         return s_table
  297.  
  298.     def order_by(self, column_name, desc=True):
  299.         '''
  300.        Order table based on column.
  301.  
  302.        Args:
  303.            column_name: string. Name of column.
  304.            desc: boolean. If True, order_by will return results in descending order (False by default).
  305.        '''
  306.         column = self.column_by_name(column_name)
  307.         idx = sorted(range(len(column)), key=lambda k: column[k], reverse=desc)
  308.         # print(idx)
  309.         self.data = [self.data[i] for i in idx]
  310.         # self._update()
  311.  
  312.  
  313.     def _inner_join(self, table_right: Table, condition):
  314.         '''
  315.        Join table (left) with a supplied table (right) where condition is met.
  316.  
  317.        Args:
  318.            condition: string. A condition using the following format:
  319.                'column[<,<=,=,>=,>]value' or
  320.                'value[<,<=,=,>=,>]column'.
  321.  
  322.                Operatores supported: (<,<=,=,>=,>)
  323.        '''
  324.         # get columns and operator
  325.         column_name_left, operator, column_name_right = self._parse_condition(condition, join=True)
  326.         # try to find both columns, if you fail raise error
  327.         try:
  328.             column_index_left = self.column_names.index(column_name_left)
  329.         except:
  330.             raise Exception(f'Column "{column_name_left}" dont exist in left table. Valid columns: {self.column_names}.')
  331.  
  332.         try:
  333.             column_index_right = table_right.column_names.index(column_name_right)
  334.         except:
  335.             raise Exception(f'Column "{column_name_right}" dont exist in right table. Valid columns: {table_right.column_names}.')
  336.  
  337.         # get the column names of both tables with the table name in front
  338.         # ex. for left -> name becomes left_table_name_name etc
  339.         left_names = [f'{self._name}.{colname}' if self._name!='' else colname for colname in self.column_names]
  340.         right_names = [f'{table_right._name}.{colname}' if table_right._name!='' else colname for colname in table_right.column_names]
  341.  
  342.         # define the new tables name, its column names and types
  343.         join_table_name = ''
  344.         join_table_colnames = left_names+right_names
  345.         join_table_coltypes = self.column_types+table_right.column_types
  346.         join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types= join_table_coltypes)
  347.  
  348.         # count the number of operations (<,> etc)
  349.         no_of_ops = 0
  350.         # this code is dumb on purpose... it needs to illustrate the underline technique
  351.         # for each value in left column and right column, if condition, append the corresponding row to the new table
  352.         for row_left in self.data:
  353.             left_value = row_left[column_index_left]
  354.             for row_right in table_right.data:
  355.                 right_value = row_right[column_index_right]
  356.                 no_of_ops+=1
  357.                 if get_op(operator, left_value, right_value): #EQ_OP
  358.                     join_table._insert(row_left+row_right)
  359.  
  360.         return join_table
  361.  
  362.     def _inlj_inner_join(self, table_right: Table, condition):
  363.         '''
  364.        Join table (left) with a supplied table (right) where condition is met, using the Index Nested-Loops Join method
  365.  
  366.        Args:
  367.            condition: string. A condition using the following format:
  368.                'column[<,<=,=,>=,>]value' or
  369.                'value[<,<=,=,>=,>]column'.
  370.  
  371.                Operatores supported: (<,<=,=,>=,>)
  372.        '''
  373.         #select * from student inner join department on dept_name=dept_name
  374.  
  375.         # get columns and operator
  376.         column_name_left, operator, column_name_right = self._parse_condition(condition, join=True)
  377.         #Finding the maximum height of the Btree
  378.         tuple_count = table_right.column_by_name(table_right.pk)
  379.         height = round(math.log(len(tuple_count)))
  380.         #Create the index
  381.         Btree_index = Btree(height)
  382.  
  383.         #Insert into Btree records of the right table, sorted on the right column of the condition.
  384.         for order_key, record in enumerate(table_right.column_by_name(table_right.pk)):
  385.             Btree_index.insert(record, order_key)
  386.  
  387.         # try to find left column, if you fail raise error
  388.         try:
  389.             column_index_left = self.column_names.index(column_name_left)
  390.         except:
  391.             raise Exception(f'Column "{column_name_left}" dont exist in left table. Valid columns: {self.column_names}.')
  392.  
  393.         # get the column names of both tables with the table name in front
  394.         # ex. for left -> name becomes left_table_name_name etc
  395.         left_names = [f'{self._name}.{colname}' if self._name!='' else colname for colname in self.column_names]
  396.         right_names = [f'{table_right._name}.{colname}' if table_right._name!='' else colname for colname in table_right.column_names]
  397.  
  398.         # define the new tables name, its column names and types
  399.         join_table_name = ''
  400.         join_table_colnames = left_names+right_names
  401.         join_table_coltypes = self.column_types+table_right.column_types
  402.         join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types= join_table_coltypes)
  403.  
  404.         #Index Nested-Loops Join
  405.         #for each record in left table:
  406.         #   search record[wanted_column] in Btree index of right table
  407.         #       if found then add to new table (record from left table, record from right table)
  408.         for outer_record in self.data:
  409.                         left_record = outer_record[column_index_left]
  410.                         Matching_index = Btree_index.find(operator,left_record)
  411.                         if(len(Matching_index)>0):
  412.                             for match_id in Matching_index:
  413.                                 join_table._insert(outer_record + table_right.data[match_id])
  414.  
  415.         return join_table
  416.  
  417.     def _smj_inner_join(self, table_right: Table, condition):
  418.         '''
  419.        Join table (left) with a supplied table (right) where condition is met, using the 2-way External Merge-Sort
  420.        algorithm, to support the Sort-Merge Join method
  421.  
  422.        Args:
  423.            condition: string. A condition using the following format:
  424.                'column[<,<=,=,>=,>]value' or
  425.                'value[<,<=,=,>=,>]column'.
  426.  
  427.                Operatores supported: (<,<=,=,>=,>)
  428.        '''
  429.         #select * from student inner join advisor on id=s_id
  430.  
  431.         # get the column names of both tables with the table name in front
  432.         # get columns and operator
  433.         column_name_left, operator, column_name_right = self._parse_condition(condition, join=True)
  434.  
  435.         # ex. for left -> name becomes left_table_name_name etc
  436.         left_names = [f'{self._name}.{colname}' if self._name!='' else colname for colname in self.column_names]
  437.         right_names = [f'{table_right._name}.{colname}' if table_right._name!='' else colname for colname in table_right.column_names]
  438.  
  439.         # define the new tables name, its column names and types
  440.         join_table_name = ''
  441.         join_table_colnames = left_names+right_names
  442.         join_table_coltypes = self.column_types+table_right.column_types
  443.         join_table = Table(name=join_table_name, column_names=join_table_colnames, column_types= join_table_coltypes)
  444.  
  445.         #Sort-Merge Join
  446.         #Sort left table on left column of condition and right table on right column of condition
  447.         self.order_by(column_name_left,False)
  448.         table_right.order_by(column_name_right,False)
  449.  
  450.         #Get number of rows for each table
  451.         left_datalen = len(self.data)
  452.         right_datalen = len(table_right.data)
  453.  
  454.         #2-way External Merge-Sort
  455.         l_idx, r_idx = 0, 0
  456.         #While there are still unchecked rows:
  457.         #   If condition is true, insert to new table (record from left table, record from right table)
  458.         #       else get the next record from the table with the record that has the lower value
  459.         while l_idx < left_datalen and r_idx < right_datalen:
  460.             if(self.column_by_name(column_name_left)[l_idx] == table_right.column_by_name(column_name_right)[r_idx]):
  461.                 join_table._insert(self.data[l_idx]+table_right.data[r_idx])
  462.                 l_idx+=1
  463.                 r_idx+=1
  464.             elif(self.data[l_idx] < table_right.data[r_idx]):
  465.                 l_idx+=1
  466.             else:
  467.                 r_idx+=1
  468.  
  469.         return join_table
  470.  
  471.     def show(self, no_of_rows=None, is_locked=False):
  472.         '''
  473.        Print the table in a nice readable format.
  474.  
  475.        Args:
  476.            no_of_rows: int. Number of rows.
  477.            is_locked: boolean. Whether it is locked (False by default).
  478.        '''
  479.         output = ""
  480.         # if the table is locked, add locked keyword to title
  481.         if is_locked:
  482.             output += f"\n## {self._name} (locked) ##\n"
  483.         else:
  484.             output += f"\n## {self._name} ##\n"
  485.  
  486.         # headers -> "column name (column type)"
  487.         headers = [f'{col} ({tp.__name__})' for col, tp in zip(self.column_names, self.column_types)]
  488.         if self.pk_idx is not None:
  489.             # table has a primary key, add PK next to the appropriate column
  490.             headers[self.pk_idx] = headers[self.pk_idx]+' #PK#'
  491.         # detect the rows that are no tfull of nones (these rows have been deleted)
  492.         # if we dont skip these rows, the returning table has empty rows at the deleted positions
  493.         non_none_rows = [row for row in self.data if any(row)]
  494.         # print using tabulate
  495.         print(tabulate(non_none_rows[:no_of_rows], headers=headers)+'\n')
  496.  
  497.  
  498.     def _parse_condition(self, condition, join=False):
  499.         '''
  500.        Parse the single string condition and return the value of the column and the operator.
  501.  
  502.        Args:
  503.            condition: string. A condition using the following format:
  504.                'column[<,<=,==,>=,>]value' or
  505.                'value[<,<=,==,>=,>]column'.
  506.  
  507.                Operatores supported: (<,<=,==,>=,>)
  508.            join: boolean. Whether to join or not (False by default).
  509.        '''
  510.         # if both_columns (used by the join function) return the names of the names of the columns (left first)
  511.         if join:
  512.             return split_condition(condition)
  513.  
  514.         # cast the value with the specified column's type and return the column name, the operator and the casted value
  515.         left, op, right = split_condition(condition)
  516.         if left not in self.column_names:
  517.             raise ValueError(f'Condition is not valid (cant find column name)')
  518.         coltype = self.column_types[self.column_names.index(left)]
  519.  
  520.         return left, op, coltype(right)
  521.  
  522.  
  523.     def _load_from_file(self, filename):
  524.         '''
  525.        Load table from a pkl file (not used currently).
  526.  
  527.        Args:
  528.            filename: string. Name of pkl file.
  529.        '''
  530.         f = open(filename, 'rb')
  531.         tmp_dict = pickle.load(f)
  532.         f.close()
  533.  
  534.         self.__dict__.update(tmp_dict)
Add Comment
Please, Sign In to add comment