Advertisement
Guest User

Untitled

a guest
Sep 18th, 2016
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.27 KB | None | 0 0
  1. __author__ = 'tanza'
  2.  
  3.  
  4.  
  5. import random, time
  6. import networkx as nx
  7.  
  8.  
  9.  
  10. def assert_correctness(users, correct_num_users, correct_num_emails):
  11.     """Assert the correctness of the deduplication process
  12.       :param: users is the deduplicated dictionary of users to assert correctness
  13.       :param: correct_num_users is the expected number of unique users
  14.       :param: correct_num_emails is the expected number of unique emails
  15.  
  16.       returns None -- output result is printed here instead
  17.  
  18.       Explanation: A dictionary of users is deduplicated if the set of unique users and emails found throughout
  19.       the dictionary amount to the same amount of users we started with.  This ensures we did not lose any information.
  20.       Second, we require that no email found in any user is found in any other user.  This ensures that the dupication
  21.       clause is correct.
  22.    """
  23.  
  24.     #Test of the Loss of Data Clause
  25.     set_users = set([])
  26.     set_emails = set([])
  27.     for key in users:
  28.         for item in key:
  29.             set_users.add(item)
  30.         for val in users[key]:
  31.             set_emails.add(val)
  32.  
  33.  
  34.     #Test the duplication clause
  35.     violations_found = 0
  36.     already_probed = {}
  37.     for user in users:
  38.         for other_user in users:
  39.             if not user == other_user and not already_probed.get((user, other_user), False):
  40.                 already_probed[(user,other_user)] = True
  41.                 if len(set(list(users[user])).intersection(set(list(users[other_user])))) > 0:
  42.                     violations_found += 1
  43.  
  44.  
  45.     #Pretty print deduplicated users
  46.     print "\n--((List of Deduplicated Users))--"
  47.     print " -- Num Users:  " + str(len(set_users))
  48.     print " -- Num Emails: " + str(len(set_emails))
  49.     pretty_print_dict(users)
  50.  
  51.  
  52.     #Print out report
  53.     print "\n--((Assertion Evaluation))--"
  54.     print " -- Users look unique and correct!" if len(set_users) == correct_num_users else " -- These users don't look right."
  55.     print " -- Emails look unique and correct!" if len(set_emails) == correct_num_emails else " -- These emails don't look right."
  56.     print " -- Duplication Violations found: " + str(violations_found)
  57.  
  58.  
  59.  
  60. def gen_random_users(N = 10, tiers = [0.70, 0.80, 0.90, 0.95, 0.98, 0.99, 0.999], V = 1):
  61.     """generates a random dictionary of users and emails
  62.       :param: N controls the number of users to be generated
  63.       :param: tiers controls the probabilities of increasing numbers of emails per user
  64.       :param: V controls the variety of the emails generated.  Higher V = less chance of duplicate
  65.  
  66.       returns users - a list of random users with random emails
  67.    """
  68.  
  69.     users = {}
  70.     set_emails = set([])
  71.     variety = V*N/10
  72.     for i in range(1,N+1):
  73.         #gen random list of emails
  74.         f = random.random()
  75.         for t,tier in enumerate(tiers):
  76.             if f < tier:
  77.                 n = t+1
  78.                 break
  79.         emails = []
  80.         for j in range(n):
  81.             emails.append(random.randint(1,10+variety))
  82.         emails = list(set(emails))
  83.         emails = ['e'+str(e) for e in emails]
  84.         for email in emails:
  85.             set_emails.add(email)
  86.         users[tuple(['u'+str(i),])] = emails
  87.     return users, N, len(set_emails)
  88.  
  89. def pretty_print_dict(users):
  90.     """Pretty prints a dictionary of users
  91.       :param: users is a dictionary to pretty print
  92.  
  93.       returns None
  94.    """
  95.  
  96.     max_n = 10
  97.     cur_n = 0
  98.     len_n = len(users)
  99.  
  100.     max_e = 5
  101.     max_u = 5
  102.  
  103.     s = ""
  104.     s += "  {\n"
  105.  
  106.     for user in users:
  107.         if (cur_n < max_n and cur_n < len_n):
  108.             cur_n += 1
  109.             if len(user) < max_u:
  110.                 s += "     " + str(user) + ": "
  111.             else:
  112.                 s += "      (" + ','.join(["'" + str(u) + "'" for u in list(user)][:max_u]) + " + " + str(len(user)-max_u) + " more...)" + ": "
  113.  
  114.             s += "["
  115.             cur_e = 0
  116.             len_e = len(users[user])
  117.             for e in users[user]:
  118.                 if (cur_e < max_e and cur_e < len_e):
  119.                     cur_e += 1
  120.                     s += "'" + str(e) + "'" + ","
  121.                 else:
  122.                     break
  123.             if cur_e < len_e:
  124.                 s += " + " + str(len_e - cur_e) + " more,"
  125.             s = s[:-1]
  126.             s += "],\n"
  127.         else:
  128.             break
  129.     if cur_n < len_n:
  130.         s += "     < ... " + str(len_n - cur_n) + " more keys>,\n"
  131.     s = s[:-2] + "\n"
  132.     s += "  }"
  133.     print s
  134.  
  135.  
  136. def brute_force_dedup(users):
  137.     """brute force approach iterates over pairs of users, searching for intersections of common emails
  138.    :param: users is a dictionary of users and their associated emails
  139.  
  140.    returns dedup_users, a list of deduplicated users
  141.    """
  142.  
  143.     #Begin process for deduplication - O(N^3) algorithm if changes made every iteration.  At best O(N^2) with no duplications.
  144.     dedup_users = {}
  145.     changes_made = True
  146.     while(changes_made):  # how many times will changes be made?  Possibly |users|-1 times but depends on number of duplications.
  147.         changes_made = False
  148.         already_probed = {}
  149.         for u in users:
  150.             dedup_users[u] = users[u]
  151.             for v in users:
  152.                 if not u == v:
  153.                     if u != v and not already_probed.get((u,v), False):   # O(|users|*|users|*0.5 - |users|) = O(|users|*|users|)
  154.                         already_probed[(u,v)] = True
  155.                         if len(set(users[u]).intersection(set(users[v]))) > 0:
  156.                             try:
  157.                                 del dedup_users[u]
  158.                             except:
  159.                                 pass
  160.                             try:
  161.                                 del dedup_users[v]
  162.                             except:
  163.                                 pass
  164.                             dedup_users[tuple(list(set(u).union(set(v))))] = list(set(users[u]).union(set(users[v])))
  165.                             changes_made = True
  166.         if changes_made:
  167.             users = {key: dedup_users[key] for key in dedup_users}
  168.  
  169.     return dedup_users
  170.  
  171.  
  172. def graph_dedup(users):
  173.     """returns a deduplication of the users
  174.    :param: users is a dictionary of users with associated emails
  175.  
  176.    returns dedup_users, a dictionary of deduplicated users
  177.  
  178.    Explanation: Uses graph theory to construct a graph using the inverted users-emails dictionary.  The connected
  179.    components of this graph represent deduplicated users
  180.    """
  181.  
  182.     #invert the users dictionary - O(|emails|) -> O(N)
  183.     emails = {}
  184.     for user in users:
  185.         for email in users[user]:
  186.             try:
  187.                 emails[email].add(user)
  188.             except:
  189.                 emails[email] = set([user])
  190.     emails = {key: list(emails[key]) for key in emails}
  191.  
  192.  
  193.     #build graph -- network x library - O(|emails|) -> O(N)
  194.     G = nx.Graph()
  195.     for email in emails:
  196.         if len(emails[email]) > 1:
  197.             for u1,u2 in zip(emails[email][:-1], emails[email][1:]):
  198.                 G.add_edge(u1, u2, email=email)
  199.         else:
  200.             G.add_edge(emails[email][0], emails[email][0], email=email)
  201.  
  202.  
  203.     dedup_users = {}
  204.     ccs = nx.connected_components(G) #O(V+E) -> O(N)
  205.  
  206.     #O(N) -- every user is hit only once.  Add  complexity of set unions
  207.     for cc in ccs:
  208.         user_list = tuple([c[0] for c in cc])
  209.         email_list = set()
  210.         for user in user_list:
  211.             email_list = email_list.union(set(list(users[tuple([user])]))) # -> O(|users[user]|)
  212.         dedup_users[user_list] = tuple(list(email_list))
  213.     return dedup_users
  214.  
  215.  
  216.  
  217. #""" test case 1
  218. users = {
  219.     tuple(['u1',]): ['e1', 'e2'],
  220.     tuple(['u2',]): ['e1', 'e3'],
  221.     tuple(['u3',]): ['e4'],
  222.     tuple(['u4',]): ['e2', 'e5'],
  223.     tuple(['u5',]): ['e6'],
  224.     tuple(['u6',]): ['e7', 'e8'],
  225.     tuple(['u7',]): ['e8'],
  226.     tuple(['u8',]): ['e1', 'e9']
  227. }
  228. highest_email = 9
  229. N = 8
  230. #"""
  231.  
  232. """ test case 2
  233. users = {('u2',): ['e9'], ('u5',): ['e7'], ('u4',): ['e11', 'e9', 'e3', 'e5'], ('u9',): ['e6'], ('u7',): ['e1', 'e4'], ('u8',): ['e8'], ('u6',): ['e6'], ('u1',): ['e6'], ('u3',): ['e8', 'e9', 'e10', 'e2'], ('u10',): ['e10']}
  234. highest_email = 11
  235. N = 10
  236. """
  237.  
  238. """ test case 3 = random generation.  Generic random means the number of emails can be low or high, so it may be slow or fast.
  239. users,N,highest_email = gen_random_users(N = 12)
  240. """
  241.  
  242. #""" test case 4 = random generation with very low number of emails.  This case is easier to run.
  243. #users,N,highest_email = gen_random_users(N = 12, tiers = [0.95, 0.99, 0.999])
  244. #"""
  245.  
  246.  
  247. #""" test case 5 = random generation with very high number of emails, but low variety, so there's a lot of intersection cases.  This case will take much longer to run
  248. #users,N,highest_email = gen_random_users(N = 12000, tiers = [0.35, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80 ,0.85, 0.90, 0.95, 0.99, 0.999])
  249. #"""
  250.  
  251. #""" test case 6 = random generation with very high number of emails, but wide variety meaning less intersection.
  252. #users,N,highest_email = gen_random_users(N = 12, tiers = [0.35, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80 ,0.85, 0.90, 0.95, 0.99, 0.999], V = 10)
  253. #"""
  254.  
  255. #""" test case 7 = random generation with very low number of emails and high variety.  This case should be the fastest.
  256. #users,N,highest_email = gen_random_users(N = 12, tiers = [0.95, 0.99, 0.9999], V = 10)
  257. #"""
  258.  
  259. #""" test case 8 = same as test case 7 but higher N=num users
  260. #users,N,highest_email = gen_random_users(N = 5000, tiers = [0.95, 0.99, 0.9999], V = 10)
  261. #"""
  262.  
  263.  
  264. #Initial pretty print of original list of users
  265. print "\n--((Original List of Users))--"
  266. print " -- Num Users =  " + str(N)
  267. print " -- Num Emails = " + str(highest_email)
  268. pretty_print_dict(users)
  269.  
  270.  
  271. """
  272. #deduplication - brute force method - O(N^3)
  273. t_start = time.time()
  274. dedup_users = brute_force_dedup({key: users[key] for key in users})
  275. assert_correctness(dedup_users, N, highest_email)
  276. print "\n"
  277. print "Finished!  That happened in " + str("%.3f" % (time.time() - t_start)) + " seconds."
  278. print "\n"
  279. """
  280.  
  281. #deduplication - graph theory approach - O(N^2)
  282. t_start = time.time()
  283. dedup_users = graph_dedup({key: users[key] for key in users})
  284. assert_correctness(dedup_users, N, highest_email)
  285. print "\n"
  286. print "Finished!  That happened in " + str("%.3f" % (time.time() - t_start)) + " seconds."
  287. print "\n"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement