# CS1656 Spring 2019 Assignment 3: Association Rule Mining
# Author: Michael Korst (mpk44@pitt.edu) GH username: paranoidandroid81
import csv, argparse, itertools
# helper to generate unique permutations for 2-partitions of varying sizes
# return dict mapping size to part sets
def gen_permuts(combos, size):
    # for each permutation create partitions of up to size 2
    size_to_part = {}
    for permut in combos:
        for i in range(size - 1):
            part_1 = permut[:(i + 1)]   # generate partition 1 of size i + 1
            part_2 = permut[(i + 1):]   # generate partition 2 of size (size - 1 - i)
            # sort and turn into string for hashing
            part_1 = ''.join(sorted(part_1))
            part_2 = ''.join(sorted(part_2))
            if len(part_1) not in size_to_part:
                size_to_part[len(part_1)] = set()
            size_to_part[len(part_1)].add(part_1)
            if len(part_2) not in size_to_part:
                size_to_part[len(part_2)] = set()
            size_to_part[len(part_2)].add(part_2)
    return size_to_part
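# Illustrative sketch (hypothetical values, not from the assignment data):
# gen_permuts([('A', 'B', 'C')], 3) cuts the permutation at each point,
# producing {1: {'A', 'C'}, 2: {'AB', 'BC'}}; each partition is sorted and
# joined into a string so it can live in the size-keyed sets.
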
# helper to run thru 1 iteration of apriori association rule searching
def apriori_assoc(vfis, size, input_sets, num_trans, min_sup, min_conf):
    # first generate possible permutations based on each item in vfis
    rule_to_stats = {}  # map found rules to support, confidence
    v_combos = []  # list of lists of combos/permutations for each vfi
    for v_key in vfis.keys():
        vfi_items = []
        split_key = v_key.split(',')  # split by comma
        for item in split_key:
            vfi_items.append(item)
        vfi_items = set(vfi_items)  # remove repeats
        v_combos.append(list(itertools.permutations(vfi_items, size)))
    for combo in v_combos:
        item_key = combo[0]
        item_key = ','.join(item_key)  # string key for each item
        whole_sup = get_sup(combo, input_sets, num_trans)  # get sup of entire combo
        s2p = gen_permuts(combo, size)  # get size to partition mappings
        for part_size in s2p.keys():
            # go thru each possible size, create partitions
            part2_size = size - part_size  # size of 2nd partition = size minus size of 1st
            for part_1 in s2p[part_size]:
                part_key = ','.join(part_1)
                part_sup = get_sup([part_1], input_sets, num_trans)
                for part_2 in s2p[part2_size]:
                    # only a disjoint pair of parts forms a valid rule
                    # (a plain substring test would miss overlaps like 'AB' vs 'BC')
                    if not (set(part_1) & set(part_2)):
                        part2_key = ','.join(part_2)
                        conf = float(whole_sup[item_key] / part_sup[part_key])
                        # if min sup and min conf are met, add rule
                        if whole_sup[item_key] >= min_sup and conf >= min_conf:
                            conf_key = f"{part_key},'=>',{part2_key}"
                            rule_to_stats[conf_key] = (whole_sup[item_key], conf)
    return rule_to_stats
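# Illustrative sketch (hypothetical inputs): given vfis = {'A,B': 0.5}, size 2,
# and transactions [{'A', 'B'}, {'A'}], the candidate rules are A => B
# (confidence 0.5 / 1.0 = 0.5) and B => A (confidence 0.5 / 0.5 = 1.0);
# with min_sup = 0.5 and min_conf = 0.8 only the latter survives, so the
# returned dict is {"B,'=>',A": (0.5, 1.0)}.
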
# helper to run thru 1 iteration of apriori frequency pruning
def apriori_prune(cands, size, input_sets, num_trans, min_sup, min_conf):
    # find support % for each candidate
    cand_to_sup = get_sup(cands, input_sets, num_trans)
    # get only support percentages >= min_sup (verified frequent)
    vfis = {k: v for k, v in cand_to_sup.items() if v >= min_sup}
    # generate candidates based on vfis
    cands = gen_next_cands(vfis, size)
    # return tuple of (verified, candidates)
    return (vfis, cands)
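# Illustrative sketch (hypothetical inputs): apriori_prune([('A',), ('B',)], 1,
# [{'A', 'B'}, {'A'}], 2, 0.6, 0.8) keeps only A (support 1.0; B sits at 0.5)
# and returns ({'A': 1.0}, []), since no size-2 candidate can be built from a
# single surviving item.
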
# helper to find appearances of set in input data, calculate support percentages
def get_sup(items, input_sets, num_trans):
    items_to_sup = {}  # map each item to its support count (later percentage)
    # now go thru each item, counting appearances in input data
    for item in items:
        item_key = ','.join(item)  # string key for each item
        for trans in input_sets:
            if set(item) <= trans:
                if item_key not in items_to_sup:
                    items_to_sup[item_key] = 1
                else:
                    items_to_sup[item_key] += 1
    # now convert support counts to percentages
    for item_key in items_to_sup:
        items_to_sup[item_key] /= num_trans
    return items_to_sup
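# Illustrative sketch (hypothetical inputs): get_sup([('A',), ('A', 'B')],
# [{'A', 'B'}, {'A'}], 2) returns {'A': 1.0, 'A,B': 0.5}. Note that an itemset
# appearing in zero transactions gets no key at all rather than a support of
# 0.0, so callers should only look up keys known to be frequent.
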
# helper to generate plausible candidate itemsets of size + 1 based on vfis
def gen_next_cands(vfis, size):
    # first generate list based on individual items in vfis
    vfi_items = []
    for v_key in vfis.keys():
        split_key = sorted(v_key.split(','))  # split by comma, sort
        for item in split_key:
            vfi_items.append(item)
    vfi_items = set(vfi_items)  # get rid of repeats
    all_possible = list(itertools.combinations(vfi_items, (size + 1)))
    # convert all vfis to sets
    vfi_sets = []
    for v_key in vfis.keys():
        split_key = v_key.split(',')  # split by comma
        vfi_sets.append(set(split_key))
    # now based on all possible, ensure all subsets are in vfis
    next_cands = []
    for poss in all_possible:
        all_subs = list(itertools.combinations(poss, size))
        found_subs = 0  # increment for each found subset
        for sub in all_subs:
            # iterate through each subset of a candidate, check if in vfis
            for v_set in vfi_sets:
                if set(sub) <= v_set:
                    found_subs += 1
                    break
        if found_subs == len(all_subs):
            next_cands.append(poss)
    return next_cands
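# Illustrative sketch (hypothetical inputs): gen_next_cands({'A,B': 0.5,
# 'A,C': 0.5, 'B,C': 0.5}, 2) proposes the single triple containing A, B, C
# because every size-2 subset is itself frequent; if any pair were missing,
# the triple would be pruned, per Apriori's downward-closure property.
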
# helper to format (and round) floats to four decimal places
def round_floats(num):
    return f'{num:.4f}'
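# e.g. round_floats(0.33333333) returns '0.3333' (a hypothetical value)
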
# --- BEGIN MAIN CODE ---
# add expected args, parse from command line
parser = argparse.ArgumentParser()
parser.add_argument('input_filename')
parser.add_argument('output_filename')
parser.add_argument('min_support_percentage', type=float)
parser.add_argument('min_confidence', type=float)
args = vars(parser.parse_args())  # store args, convert to dict
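# Illustrative invocation (hypothetical filenames):
#   python3 assignment3.py input.csv output.csv 0.5 0.7
# i.e. input file, output file, minimum support percentage, minimum confidence.
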
# map each transaction_id to a list of items
input_data = {}
# now open input, read in
with open('./' + args['input_filename']) as csvf:
    creader = csv.reader(csvf)
    for row in creader:
        # for each line, map trans id to list of items
        curr_id = row[0]
        # first field is trans id, remaining are items
        if curr_id not in input_data:
            input_data[curr_id] = []
        for item in row[1:]:
            input_data[curr_id].append(item)
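# Each input row is a transaction id followed by its items, so a hypothetical
# line "1,A,B,C" maps to input_data['1'] = ['A', 'B', 'C'].
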
# begin Apriori algorithm
# first build up list of all elements, use to look thru all possible combinations
# also build up list of all inputs in set notation
all_items = []
all_sets = []
for trans_id in input_data.keys():
    curr = set()
    for item in input_data[trans_id]:
        curr.add(item)
        all_items.append(item)
    all_sets.append(curr)
all_items = set(all_items)  # convert to set to remove repeated elements
i = 1  # start with all combinations of length 1
next_frequent = True  # determines when to terminate algorithm
# first run is all sets of size one, generate all combinations
combos = list(itertools.combinations(all_items, i))
all_vfis = []  # keep master list of verified frequent itemsets
all_rules = []  # master list of verified association rules
while next_frequent:
    # find vfis + candidates
    prune_rv = apriori_prune(combos, i, all_sets, len(input_data),
                             args['min_support_percentage'], args['min_confidence'])
    # index 0 of return = vfis, index 1 = next candidates
    all_vfis.append(prune_rv[0])
    # find association rules
    rules_rv = apriori_assoc(prune_rv[0], i, all_sets, len(input_data),
                             args['min_support_percentage'], args['min_confidence'])
    if rules_rv:
        # if not empty, append rules
        all_rules.append(rules_rv)
    if len(prune_rv[1]) == 0:
        # when no next candidate itemsets generated, we stop
        next_frequent = False
    else:
        i += 1  # move on to next size if still candidates
        combos = prune_rv[1]
# sort vfis before printing to csv
sort_vfis = []
for type_vfi in all_vfis:
    sort_vfis.append({','.join(sorted(k.split(','))): v for k, v in type_vfi.items()})
# now begin printing to csv (newline='' avoids blank lines from csv.writer on Windows)
with open('./' + args['output_filename'], 'w', newline='') as csvf:
    cwriter = csv.writer(csvf)
    for type_vfi in sort_vfis:
        sort_keys = sorted(type_vfi.keys())
        for skey in sort_keys:
            row = ['S', round_floats(type_vfi[skey])]
            for letter in skey.split(','):
                row.append(letter)
            cwriter.writerow(row)
    # now print rules
    for rule in all_rules:
        sort_rules = sorted(rule.keys())
        for srule in sort_rules:
            # support, then confidence
            row = ['R', round_floats(rule[srule][0]), round_floats(rule[srule][1])]
            # print out each piece of the rule (lhs, '=>', rhs)
            for letter in srule.split(','):
                row.append(letter)
            cwriter.writerow(row)
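# Illustrative output (hypothetical data): itemset lines precede rule lines,
# with values formatted to four decimal places, e.g.:
#   S,0.5000,A,B
#   R,0.5000,1.0000,B,'=>',A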