# CS1656 Spring 2019 Assignment 3: Association Rule Mining
# Author: Michael Korst (mpk44@pitt.edu) GH username: paranoidandroid81
import csv, argparse, itertools
# helper to generate unique permutations for 2-partitions of varying sizes
# return dict mapping size to part sets
def gen_permuts(combos, size):
    # for each permutation create partitions of up to size 2
    size_to_part = {}
    for permut in combos:
        for i in range(size - 1):
            part_1 = permut[:(i + 1)]   # generate partition 1 of size i + 1
            part_2 = permut[(i + 1):]   # generate partition 2 of size (size - 1 - i)
            # sort and turn into string for hashing
            part_1 = ''.join(sorted(part_1))
            part_2 = ''.join(sorted(part_2))
            if len(part_1) not in size_to_part:
                size_to_part[len(part_1)] = set()
            size_to_part[len(part_1)].add(part_1)
            if len(part_2) not in size_to_part:
                size_to_part[len(part_2)] = set()
            size_to_part[len(part_2)].add(part_2)
    return size_to_part
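# Illustrative sketch (hypothetical values, not from the assignment data):
# gen_permuts([('A', 'B', 'C')], 3) cuts the permutation at each point,
# producing {1: {'A', 'C'}, 2: {'AB', 'BC'}}; each partition is sorted and
# joined into a string so it can live in the size-keyed sets.
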
# helper to run thru 1 iteration of apriori association rule searching
def apriori_assoc(vfis, size, input_sets, num_trans, min_sup, min_conf):
    # first generate possible permutations based on each item in vfis
    rule_to_stats = {}  # map found rules to support, confidence
    v_combos = []  # list of lists of combos/permutations for each vfi
    for v_key in vfis.keys():
        vfi_items = []
        split_key = v_key.split(',')  # split by comma
        for item in split_key:
            vfi_items.append(item)
        vfi_items = set(vfi_items)  # remove repeats
        v_combos.append(list(itertools.permutations(vfi_items, size)))
    for combo in v_combos:
        item_key = combo[0]
        item_key = ','.join(item_key)  # string key for each item
        whole_sup = get_sup(combo, input_sets, num_trans)  # get sup of entire combo
        s2p = gen_permuts(combo, size)  # get size to partition mappings
        for part_size in s2p.keys():
            # go thru each possible size, create partitions
            part2_size = size - part_size  # size of 2nd partition = size minus size of 1st
            for part_1 in s2p[part_size]:
                part_key = ','.join(part_1)
                part_sup = get_sup([part_1], input_sets, num_trans)
                for part_2 in s2p[part2_size]:
                    # only a disjoint pair of parts forms a valid rule
                    # (a plain substring test would miss overlaps like 'AB' vs 'BC')
                    if not (set(part_1) & set(part_2)):
                        part2_key = ','.join(part_2)
                        conf = float(whole_sup[item_key] / part_sup[part_key])
                        # if min sup and min conf are met, add rule
                        if whole_sup[item_key] >= min_sup and conf >= min_conf:
                            conf_key = f"{part_key},'=>',{part2_key}"
                            rule_to_stats[conf_key] = (whole_sup[item_key], conf)
    return rule_to_stats
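# Illustrative sketch (hypothetical inputs): given vfis = {'A,B': 0.5}, size 2,
# and transactions [{'A', 'B'}, {'A'}], the candidate rules are A => B
# (confidence 0.5 / 1.0 = 0.5) and B => A (confidence 0.5 / 0.5 = 1.0);
# with min_sup = 0.5 and min_conf = 0.8 only the latter survives, so the
# returned dict is {"B,'=>',A": (0.5, 1.0)}.
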
# helper to run thru 1 iteration of apriori frequency pruning
def apriori_prune(cands, size, input_sets, num_trans, min_sup, min_conf):
    # find support % for each candidate
    cand_to_sup = get_sup(cands, input_sets, num_trans)
    # get only support percentages >= min_sup (verified frequent)
    vfis = {k: v for k, v in cand_to_sup.items() if v >= min_sup}
    # generate candidates based on vfis
    cands = gen_next_cands(vfis, size)
    # return tuple of (verified, candidates)
    return (vfis, cands)
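# Illustrative sketch (hypothetical inputs): apriori_prune([('A',), ('B',)], 1,
# [{'A', 'B'}, {'A'}], 2, 0.6, 0.8) keeps only A (support 1.0; B sits at 0.5)
# and returns ({'A': 1.0}, []), since no size-2 candidate can be built from a
# single surviving item.
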
# helper to find appearances of set in input data, calculate support percentages
def get_sup(items, input_sets, num_trans):
    items_to_sup = {}  # map each item to its support count (later percentage)
    # now go thru each item, counting appearances in input data
    for item in items:
        item_key = ','.join(item)  # string key for each item
        for trans in input_sets:
            if set(item) <= trans:
                if item_key not in items_to_sup:
                    items_to_sup[item_key] = 1
                else:
                    items_to_sup[item_key] += 1
    # now convert support counts to percentages
    for item_key in items_to_sup:
        items_to_sup[item_key] /= num_trans
    return items_to_sup
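# Illustrative sketch (hypothetical inputs): get_sup([('A',), ('A', 'B')],
# [{'A', 'B'}, {'A'}], 2) returns {'A': 1.0, 'A,B': 0.5}. Note that an itemset
# appearing in zero transactions gets no key at all rather than a support of
# 0.0, so callers should only look up keys known to be frequent.
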
# helper to generate plausible candidate itemsets of size + 1 based on vfis
def gen_next_cands(vfis, size):
    # first generate list based on individual items in vfis
    vfi_items = []
    for v_key in vfis.keys():
        split_key = sorted(v_key.split(','))  # split by comma, sort
        for item in split_key:
            vfi_items.append(item)
    vfi_items = set(vfi_items)  # get rid of repeats
    all_possible = list(itertools.combinations(vfi_items, (size + 1)))
    # convert all vfis to sets
    vfi_sets = []
    for v_key in vfis.keys():
        split_key = v_key.split(',')  # split by comma
        vfi_sets.append(set(split_key))
    # now based on all possible, ensure all subsets are in vfis
    next_cands = []
    for poss in all_possible:
        all_subs = list(itertools.combinations(poss, size))
        found_subs = 0  # increment for each found subset
        for sub in all_subs:
            # iterate through each subset of a candidate, check if in vfis
            for v_set in vfi_sets:
                if set(sub) <= v_set:
                    found_subs += 1
                    break
        if found_subs == len(all_subs):
            next_cands.append(poss)
    return next_cands
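# Illustrative sketch (hypothetical inputs): gen_next_cands({'A,B': 0.5,
# 'A,C': 0.5, 'B,C': 0.5}, 2) proposes the single triple containing A, B, C
# because every size-2 subset is itself frequent; if any pair were missing,
# the triple would be pruned, per Apriori's downward-closure property.
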
# helper to format (and round) floats to four decimal places
def round_floats(num):
    return f'{num:.4f}'
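# e.g. round_floats(0.33333333) returns '0.3333' (a hypothetical value)
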
# --- BEGIN MAIN CODE ---
# add expected args, parse from command line
parser = argparse.ArgumentParser()
parser.add_argument('input_filename')
parser.add_argument('output_filename')
parser.add_argument('min_support_percentage', type=float)
parser.add_argument('min_confidence', type=float)
args = vars(parser.parse_args())  # store args, convert to dict
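# Illustrative invocation (hypothetical filenames):
#   python3 assignment3.py input.csv output.csv 0.5 0.7
# i.e. input file, output file, minimum support percentage, minimum confidence.
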
# map each transaction_id to a list of items
input_data = {}
# now open input, read in
with open('./' + args['input_filename']) as csvf:
    creader = csv.reader(csvf)
    for row in creader:
        # for each line, map trans id to list of items
        curr_id = row[0]
        # first field is trans id, remaining are items
        if curr_id not in input_data:
            input_data[curr_id] = []
        for item in row[1:]:
            input_data[curr_id].append(item)
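# Each input row is a transaction id followed by its items, so a hypothetical
# line "1,A,B,C" maps to input_data['1'] = ['A', 'B', 'C'].
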
# begin Apriori algorithm
# first build up list of all elements, use to look thru all possible combinations
# also build up list of all inputs in set notation
all_items = []
all_sets = []
for trans_id in input_data.keys():
    curr = set()
    for item in input_data[trans_id]:
        curr.add(item)
        all_items.append(item)
    all_sets.append(curr)
all_items = set(all_items)  # convert to set to remove repeated elements
i = 1  # start with all combinations of length 1
next_frequent = True  # determines when to terminate algorithm
# first run is all sets of size one, generate all combinations
combos = list(itertools.combinations(all_items, i))
all_vfis = []  # keep master list of verified frequent itemsets
all_rules = []  # master list of verified association rules
while next_frequent:
    # find vfis + candidates
    prune_rv = apriori_prune(combos, i, all_sets, len(input_data),
                             args['min_support_percentage'], args['min_confidence'])
    # index 0 of return = vfis, index 1 = next candidates
    all_vfis.append(prune_rv[0])
    # find association rules
    rules_rv = apriori_assoc(prune_rv[0], i, all_sets, len(input_data),
                             args['min_support_percentage'], args['min_confidence'])
    if rules_rv:
        # if not empty, append rules
        all_rules.append(rules_rv)
    if len(prune_rv[1]) == 0:
        # when no next candidate itemsets generated, we stop
        next_frequent = False
    else:
        i += 1  # move on to next size if still candidates
        combos = prune_rv[1]
# sort vfis before printing to csv
sort_vfis = []
for type_vfi in all_vfis:
    sort_vfis.append({','.join(sorted(k.split(','))): v for k, v in type_vfi.items()})
# now begin printing to csv (newline='' avoids blank lines from csv.writer on Windows)
with open('./' + args['output_filename'], 'w', newline='') as csvf:
    cwriter = csv.writer(csvf)
    for type_vfi in sort_vfis:
        sort_keys = sorted(type_vfi.keys())
        for skey in sort_keys:
            row = ['S', round_floats(type_vfi[skey])]
            for letter in skey.split(','):
                row.append(letter)
            cwriter.writerow(row)
    # now print rules
    for rule in all_rules:
        sort_rules = sorted(rule.keys())
        for srule in sort_rules:
            # support, then confidence
            row = ['R', round_floats(rule[srule][0]), round_floats(rule[srule][1])]
            # print out each piece of the rule (lhs, '=>', rhs)
            for letter in srule.split(','):
                row.append(letter)
            cwriter.writerow(row)
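# Illustrative output (hypothetical data): itemset lines precede rule lines,
# with values formatted to four decimal places, e.g.:
#   S,0.5000,A,B
#   R,0.5000,1.0000,B,'=>',A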