Advertisement
lamiastella

relation_algebra.py

Sep 20th, 2017
271
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.42 KB | None | 0 0
  1. # TODO:
  2. # 1. Remove asserts and replace with exceptions.
  3. # 2. There is a lot of unpythonic code in here, someone who likes python can fix it :)
  4.  
  5. def get_result(x): return [tuple(t) for t in x]
  6.  
  7. def generate_dict(t, schema_index):
  8.     d = dict()
  9.     for x in schema_index.keys():
  10.         d[x] = t[schema_index[x]]
  11.     return tuple(sorted(list(d.iteritems())))
  12.  
  13. # Schema-aware comparison
  14. def compare_results(x,y):
  15.     # First check the results are the same tupe.
  16.     if set(x.schema) != set(y.schema): return False
  17.  
  18.     # Now, we want to compare them as sets but the attribute orders may be different,
  19.     # so we turn each tuple into a dictionary
  20.     xd = map(lambda t : generate_dict(t, x.schema_index),get_result(x))
  21.     yd = map(lambda t : generate_dict(t, y.schema_index),get_result(y))
  22.     return set(xd) == set(yd)
  23.  
  24. class OpBase:
  25.     def __init__(self, schema, children):
  26.         self.schema = list(schema)
  27.         self.schema_index = dict([(b,a) for (a,b) in enumerate(self.schema)])
  28.         self.in_count = 0
  29.         self.out_count = 0
  30.         self.children = children
  31.         #self.count_reads = True
  32.         #self.op_str   = None
  33.    
  34.     def __repr__(self): return "{0}".format(self.schema)
  35.    
  36.     # Get an index for an attribute
  37.     def resolve_attribute(self, x):
  38.         if x not in self.schema:
  39.                 raise NameError("{0} not found in {1}".format(x,self.schema))
  40.         return self.schema_index[x]
  41.    
  42.     # helper function to resolve many attributes
  43.     def resolve_attributes(self, attr):
  44.         return [self.resolve_attribute(x) for x in attr]
  45.  
  46.     # Code for counting the number of tuples "flowing by"
  47.     def reset_count(self):
  48.         self.in_count  = 0
  49.         self.out_count = 0
  50.         for c in self.children: c.reset_count()
  51.  
  52.     def total_count(self):
  53.         return self.in_count + sum([c.total_count() for c in self.children if c.count_reads])
  54.  
  55.     def count_str(self, offset):
  56.         out = " "*(4*offset) + "*" + " {0} ".format(self.op_str)
  57.         if self.count_reads:
  58.             out += "[tuples read in: {0} out: {1}]".format(self.in_count, self.out_count)
  59.         return out + "\n"
  60.  
  61.     def toCount(self, offset=0):
  62.         return self.count_str(offset) + ''.join([c.toCount(offset+1) for c in self.children])
  63.    
  64. # This takes in a relation with an optional name.
  65. # All operators yield a set of tuples and define a schema
  66. class BaseRelation(OpBase):
  67.     def __init__(self, res, name=None):
  68.         self.res  = res
  69.         self.name = name
  70.         self.count_reads = False
  71.         OpBase.__init__(self, res.keys, [])
  72.         self.op_str = "{0}({1}) has {2} tuples".format(self.name, ','.join(self.schema), len(self.res))
  73.        
  74.     def __iter__(self):
  75.         for r in self.res:
  76.             self.in_count += 1
  77.             self.out_count += 1
  78.             yield r
  79.  
  80.     def toMD(self):
  81.         return "{0}({1})".format(self.name, ','.join(self.schema))
  82.                
  83. class Select(OpBase):
  84.     """Selection attr=val"""
  85.     def __init__(self, attr,val,op):
  86.         self.in_op     = op
  87.         self.attr      = attr
  88.         self.v         = val
  89.         in_schema      = self.in_op.schema
  90.         self.op_str    = "$\sigma_{{{0}={1}}}$".format(self.attr, self.v)
  91.         assert(attr in in_schema) # TOOD: replace with an exception!
  92.         OpBase.__init__(self, in_schema, [self.in_op]) # Schema Unchanged
  93.         self.count_reads = True
  94.        
  95.     def __iter__(self):
  96.         idx = self.in_op.resolve_attribute(self.attr)
  97.         for r in self.in_op:
  98.             self.in_count += 1
  99.             if r[idx] == self.v:
  100.                 self.out_count += 1
  101.                 yield r
  102.                
  103.     def toMD(self):
  104.         return "$\sigma_{{{0}={1}}}$({2})".format(self.attr, self.v, self.in_op.toMD())
  105.    
  106. class Project(OpBase):
  107.     """Projection."""
  108.     def __init__(self, attributes, op):
  109.         self.attributes = attributes
  110.         self.in_op      = op
  111.         self.op_str     =  "$\Pi_{{{0}}}$".format(self.attributes)
  112.         assert(all([x in self.in_op.schema for x in attributes])) # TODO: replace with an exception
  113.         OpBase.__init__(self, self.attributes, [self.in_op]) # Schema changes!
  114.         self.count_reads = True
  115.    
  116.     def project_helper(self, idx_list, t):
  117.         return tuple([t[i] for i in idx_list])
  118.        
  119.     def __iter__(self):
  120.         idx_list = self.in_op.resolve_attributes(self.attributes)
  121.         # Remove duplicates
  122.         in_e = [self.project_helper(idx_list,t) for t in self.in_op]
  123.         self.in_count += len(in_e)
  124.         for u in set(in_e):
  125.             self.out_count += 1
  126.             yield u
  127.            
  128.     def toMD(self):
  129.         return "$\Pi_{{{0}}}$({1})".format(','.join(self.attributes), self.in_op.toMD())
  130.  
  131. class CrossProduct(OpBase):
  132.     """Cross Product"""
  133.     def __init__(self, op1, op2):
  134.         self.l      = op1
  135.         self.r      = op2
  136.         s1 = set(op1.schema)
  137.         s2 = set(op2.schema)
  138.         self.op_str = "$\times$"
  139.  
  140.         # Make sure the schemas are distinct
  141.         if len(s1.intersection(s2)) != 0:
  142.             raise ValueError("Schemas must be distinct!")
  143.         OpBase.__init__(self, s1.union(s2), [op1,op2]) # Schema changes!
  144.         self.count_reads = True
  145.        
  146.     def __iter__(self):
  147.         for x in self.l:
  148.             self.in_count += 1
  149.             for y in self.r:
  150.                 self.in_count += 1
  151.                 self.out_count += 1
  152.                 yield tuple(x) + tuple(y)
  153.  
  154.     def toMD(self):
  155.         return "{0} $\\times$ {1}".format(self.l.toMD(), self.r.toMD())
  156.  
  157. class NJoin(OpBase):
  158.     """Natural Join"""
  159.     def __init__(self, op1, op2):
  160.         self.l      = op1
  161.         self.r      = op2
  162.         s1          = set(op1.schema)
  163.         s2          = set(op2.schema)
  164.         self.common = s1.intersection(s2)
  165.         self.op_str = "$\Join_{{{0}}}$".format(','.join(self.common))
  166.         OpBase.__init__(self, op1.schema + filter(lambda x : x not in self.common, op2.schema), [op1,op2])
  167.         self.count_reads = True
  168.        
  169.     def __iter__(self):
  170.  
  171.         # return common attributes in index-order of the *left* relation
  172.         idx_cl = sorted(self.l.resolve_attributes(self.common))
  173.         common_left_order = [c for i,c in sorted([(self.l.resolve_attribute(c),c) for c in self.common])]
  174.         idx_cr = self.r.resolve_attributes(common_left_order)
  175.        
  176.         # return the attributes unique to the right relation in the *right* relation's order
  177.         idx_r  = sorted(self.r.resolve_attributes(set(self.r.schema).difference(self.common)))
  178.         for x in self.l:
  179.             self.in_count += 1
  180.             for y in self.r:
  181.                 self.in_count += 1
  182.                 if all([x[idx_cl[i]] == y[idx_cr[i]] for i in range(len(self.common))]):
  183.                     self.out_count += 1
  184.                     ty = tuple([y[i] for i in idx_r])
  185.                     yield tuple(x) + tuple(ty)
  186.                    
  187.     def toMD(self):
  188.         return "( {0} ) $\Join_{{{2}}}$ ( {1} )".format(self.l.toMD(), self.r.toMD(), ','.join(self.common))
  189.    
  190. class Union(OpBase):
  191.     """Union"""
  192.     def __init__(self, op1, op2):
  193.         self.l      = op1
  194.         self.r      = op2
  195.         self.op_str = "$\\bigcup$"
  196.         assert(op1.schema == op2.schema)
  197.         OpBase.__init__(self, op1.schema, [op1,op2])
  198.         self.count_reads = True
  199.        
  200.     def __iter__(self):
  201.         ll = get_result(self.l)
  202.         rl = get_result(self.r)
  203.         self.in_count += len(ll)
  204.         self.in_count += len(rl)
  205.         ls = set(ll)
  206.         rs = set(rl)
  207.         for x in ls.union(rs):
  208.             self.out_count += 1
  209.             yield x
  210.        
  211.     def toMD(self):
  212.         return "( {0} ) $\\bigcup$ ( {1} )".format(self.l.toMD(), self.r.toMD())
  213.  
  214. class Difference(OpBase):
  215.     """Difference"""
  216.     def __init__(self, op1, op2):
  217.         self.l      = op1
  218.         self.r      = op2
  219.         self.op_str = "-"
  220.         assert(op1.schema == op2.schema)
  221.         OpBase.__init__(self, op1.schema, [op1,op2])
  222.         self.count_reads = True
  223.        
  224.     def __iter__(self):
  225.         ll = get_result(self.l)
  226.         rl = get_result(self.r)
  227.         self.in_count += len(ll)
  228.         self.in_count += len(rl)
  229.         ls = set(ll)
  230.         rs = set(rl)
  231.         for x in ls.difference(rs):
  232.             self.out_count += 1
  233.             yield x
  234.        
  235.     def toMD(self):
  236.         return "( {0} ) - ( {1} )".format(self.l.toMD(), self.r.toMD())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement