Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- __author__ = 'tanza'
- import random, time
- import networkx as nx
- def assert_correctness(users, correct_num_users, correct_num_emails):
- """Assert the correctness of the deduplication process
- :param: users is the deduplicated dictionary of users to assert correctness
- :param: correct_num_users is the expected number of unique users
- :param: correct_num_emails is the expected number of unique emails
- returns None -- output result is printed here instead
- Explanation: A dictionary of users is deduplicated if the set of unique users and emails found throughout
- the dictionary amount to the same amount of users we started with. This ensures we did not lose any information.
- Second, we require that no email found in any user is found in any other user. This ensures that the dupication
- clause is correct.
- """
- #Test of the Loss of Data Clause
- set_users = set([])
- set_emails = set([])
- for key in users:
- for item in key:
- set_users.add(item)
- for val in users[key]:
- set_emails.add(val)
- #Test the duplication clause
- violations_found = 0
- already_probed = {}
- for user in users:
- for other_user in users:
- if not user == other_user and not already_probed.get((user, other_user), False):
- already_probed[(user,other_user)] = True
- if len(set(list(users[user])).intersection(set(list(users[other_user])))) > 0:
- violations_found += 1
- #Pretty print deduplicated users
- print "\n--((List of Deduplicated Users))--"
- print " -- Num Users: " + str(len(set_users))
- print " -- Num Emails: " + str(len(set_emails))
- pretty_print_dict(users)
- #Print out report
- print "\n--((Assertion Evaluation))--"
- print " -- Users look unique and correct!" if len(set_users) == correct_num_users else " -- These users don't look right."
- print " -- Emails look unique and correct!" if len(set_emails) == correct_num_emails else " -- These emails don't look right."
- print " -- Duplication Violations found: " + str(violations_found)
- def gen_random_users(N = 10, tiers = [0.70, 0.80, 0.90, 0.95, 0.98, 0.99, 0.999], V = 1):
- """generates a random dictionary of users and emails
- :param: N controls the number of users to be generated
- :param: tiers controls the probabilities of increasing numbers of emails per user
- :param: V controls the variety of the emails generated. Higher V = less chance of duplicate
- returns users - a list of random users with random emails
- """
- users = {}
- set_emails = set([])
- variety = V*N/10
- for i in range(1,N+1):
- #gen random list of emails
- f = random.random()
- for t,tier in enumerate(tiers):
- if f < tier:
- n = t+1
- break
- emails = []
- for j in range(n):
- emails.append(random.randint(1,10+variety))
- emails = list(set(emails))
- emails = ['e'+str(e) for e in emails]
- for email in emails:
- set_emails.add(email)
- users[tuple(['u'+str(i),])] = emails
- return users, N, len(set_emails)
- def pretty_print_dict(users):
- """Pretty prints a dictionary of users
- :param: users is a dictionary to pretty print
- returns None
- """
- max_n = 10
- cur_n = 0
- len_n = len(users)
- max_e = 5
- max_u = 5
- s = ""
- s += " {\n"
- for user in users:
- if (cur_n < max_n and cur_n < len_n):
- cur_n += 1
- if len(user) < max_u:
- s += " " + str(user) + ": "
- else:
- s += " (" + ','.join(["'" + str(u) + "'" for u in list(user)][:max_u]) + " + " + str(len(user)-max_u) + " more...)" + ": "
- s += "["
- cur_e = 0
- len_e = len(users[user])
- for e in users[user]:
- if (cur_e < max_e and cur_e < len_e):
- cur_e += 1
- s += "'" + str(e) + "'" + ","
- else:
- break
- if cur_e < len_e:
- s += " + " + str(len_e - cur_e) + " more,"
- s = s[:-1]
- s += "],\n"
- else:
- break
- if cur_n < len_n:
- s += " < ... " + str(len_n - cur_n) + " more keys>,\n"
- s = s[:-2] + "\n"
- s += " }"
- print s
- def brute_force_dedup(users):
- """brute force approach iterates over pairs of users, searching for intersections of common emails
- :param: users is a dictionary of users and their associated emails
- returns dedup_users, a list of deduplicated users
- """
- #Begin process for deduplication - O(N^3) algorithm if changes made every iteration. At best O(N^2) with no duplications.
- dedup_users = {}
- changes_made = True
- while(changes_made): # how many times will changes be made? Possibly |users|-1 times but depends on number of duplications.
- changes_made = False
- already_probed = {}
- for u in users:
- dedup_users[u] = users[u]
- for v in users:
- if not u == v:
- if u != v and not already_probed.get((u,v), False): # O(|users|*|users|*0.5 - |users|) = O(|users|*|users|)
- already_probed[(u,v)] = True
- if len(set(users[u]).intersection(set(users[v]))) > 0:
- try:
- del dedup_users[u]
- except:
- pass
- try:
- del dedup_users[v]
- except:
- pass
- dedup_users[tuple(list(set(u).union(set(v))))] = list(set(users[u]).union(set(users[v])))
- changes_made = True
- if changes_made:
- users = {key: dedup_users[key] for key in dedup_users}
- return dedup_users
- def graph_dedup(users):
- """returns a deduplication of the users
- :param: users is a dictionary of users with associated emails
- returns dedup_users, a dictionary of deduplicated users
- Explanation: Uses graph theory to construct a graph using the inverted users-emails dictionary. The connected
- components of this graph represent deduplicated users
- """
- #invert the users dictionary - O(|emails|) -> O(N)
- emails = {}
- for user in users:
- for email in users[user]:
- try:
- emails[email].add(user)
- except:
- emails[email] = set([user])
- emails = {key: list(emails[key]) for key in emails}
- #build graph -- network x library - O(|emails|) -> O(N)
- G = nx.Graph()
- for email in emails:
- if len(emails[email]) > 1:
- for u1,u2 in zip(emails[email][:-1], emails[email][1:]):
- G.add_edge(u1, u2, email=email)
- else:
- G.add_edge(emails[email][0], emails[email][0], email=email)
- dedup_users = {}
- ccs = nx.connected_components(G) #O(V+E) -> O(N)
- #O(N) -- every user is hit only once. Add complexity of set unions
- for cc in ccs:
- user_list = tuple([c[0] for c in cc])
- email_list = set()
- for user in user_list:
- email_list = email_list.union(set(list(users[tuple([user])]))) # -> O(|users[user]|)
- dedup_users[user_list] = tuple(list(email_list))
- return dedup_users
- #""" test case 1
- users = {
- tuple(['u1',]): ['e1', 'e2'],
- tuple(['u2',]): ['e1', 'e3'],
- tuple(['u3',]): ['e4'],
- tuple(['u4',]): ['e2', 'e5'],
- tuple(['u5',]): ['e6'],
- tuple(['u6',]): ['e7', 'e8'],
- tuple(['u7',]): ['e8'],
- tuple(['u8',]): ['e1', 'e9']
- }
- highest_email = 9
- N = 8
- #"""
- """ test case 2
- users = {('u2',): ['e9'], ('u5',): ['e7'], ('u4',): ['e11', 'e9', 'e3', 'e5'], ('u9',): ['e6'], ('u7',): ['e1', 'e4'], ('u8',): ['e8'], ('u6',): ['e6'], ('u1',): ['e6'], ('u3',): ['e8', 'e9', 'e10', 'e2'], ('u10',): ['e10']}
- highest_email = 11
- N = 10
- """
- """ test case 3 = random generation. Generic random means the number of emails can be low or high, so it may be slow or fast.
- users,N,highest_email = gen_random_users(N = 12)
- """
- #""" test case 4 = random generation with very low number of emails. This case is easier to run.
- #users,N,highest_email = gen_random_users(N = 12, tiers = [0.95, 0.99, 0.999])
- #"""
- #""" test case 5 = random generation with very high number of emails, but low variety, so there's a lot of intersection cases. This case will take much longer to run
- #users,N,highest_email = gen_random_users(N = 12000, tiers = [0.35, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80 ,0.85, 0.90, 0.95, 0.99, 0.999])
- #"""
- #""" test case 6 = random generation with very high number of emails, but wide variety meaning less intersection.
- #users,N,highest_email = gen_random_users(N = 12, tiers = [0.35, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80 ,0.85, 0.90, 0.95, 0.99, 0.999], V = 10)
- #"""
- #""" test case 7 = random generation with very low number of emails and high variety. This case should be the fastest.
- #users,N,highest_email = gen_random_users(N = 12, tiers = [0.95, 0.99, 0.9999], V = 10)
- #"""
- #""" test case 8 = same as test case 7 but higher N=num users
- #users,N,highest_email = gen_random_users(N = 5000, tiers = [0.95, 0.99, 0.9999], V = 10)
- #"""
- #Initial pretty print of original list of users
- print "\n--((Original List of Users))--"
- print " -- Num Users = " + str(N)
- print " -- Num Emails = " + str(highest_email)
- pretty_print_dict(users)
- """
- #deduplication - brute force method - O(N^3)
- t_start = time.time()
- dedup_users = brute_force_dedup({key: users[key] for key in users})
- assert_correctness(dedup_users, N, highest_email)
- print "\n"
- print "Finished! That happened in " + str("%.3f" % (time.time() - t_start)) + " seconds."
- print "\n"
- """
- #deduplication - graph theory approach - O(N^2)
- t_start = time.time()
- dedup_users = graph_dedup({key: users[key] for key in users})
- assert_correctness(dedup_users, N, highest_email)
- print "\n"
- print "Finished! That happened in " + str("%.3f" % (time.time() - t_start)) + " seconds."
- print "\n"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement