Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Sample input: one record per user. Each record appears to hold a username
# plus two contact values (email / phone) in no fixed position.
- [("MLGuy42", "andrew@gmail.com", "123-4567"),
- ("CS229DungeonMaster", "123-4567", "ml@stanford.edu"),
- ("Doomguy", "john@oculus.com", "carmack@gmail.com"),
- ("andrew26", "andrew@gmail.com", "mlguy@gmail.com")]
# Grouping of the sample records by shared values, as lists of record indices:
# records 0 and 1 share "123-4567", records 0 and 3 share "andrew@gmail.com",
# and record 2 shares nothing with the others.
- [[0, 1, 3], [2]]
def find_duplicates(user_info):
    """Group users that share at least one piece of identifying info.

    Two users belong to the same group when they have a value in common,
    either directly or through a chain of other users (user A shares a value
    with B, and B shares a different value with C, so A, B and C are all the
    same group).

    Args:
        user_info: Iterable of records; each record is an iterable of
            hashable values (e.g. username, email, phone).

    Returns:
        A list of groups. Each group is a list of record indices in
        ascending order, and groups are ordered by their smallest index.
        A record that shares nothing with any other record forms a group of
        its own.
    """
    parent = []  # union-find forest over record indices
    owner = {}   # value -> index of the first record that contained it

    def find(node):
        """Return the representative index of node's group."""
        root = node
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while parent[node] != root:
            following = parent[node]
            parent[node] = root
            node = following
        return root

    for i, user in enumerate(user_info):
        parent.append(i)
        for info in user:
            if info not in owner:
                owner[info] = i
                continue
            mine, theirs = find(i), find(owner[info])
            if mine != theirs:
                # Always keep the smaller index as the root so that a group's
                # representative is its earliest member. This is what merges
                # two previously separate groups when a later record bridges
                # them.
                parent[max(mine, theirs)] = min(mine, theirs)

    groups = {}
    for i in range(len(parent)):
        # Indices are visited in ascending order and every root is the
        # smallest member of its group, so dict insertion order yields groups
        # sorted by first member with members already ascending.
        groups.setdefault(find(i), []).append(i)
    return list(groups.values())
def find_duplicates(user_info):
    """Group users that share at least one piece of identifying info.

    Records are treated as nodes of a graph in which two records are
    connected when they contain a common value; the function returns the
    connected components. Unlike a single left-to-right pass that attaches
    each record to the first matching group, this also joins groups that are
    only linked through a later "bridging" record.

    Args:
        user_info: Iterable of records; each record is a re-iterable
            collection of hashable values (e.g. username, email, phone).

    Returns:
        A list of groups. Each group is a list of record indices in
        ascending order, and groups are ordered by their smallest index.
        A record that shares nothing with any other record forms a group of
        its own.
    """
    users = list(user_info)

    # Index: value -> indices of every record containing that value.
    holders = {}
    for i, user in enumerate(users):
        for info in user:
            holders.setdefault(info, []).append(i)

    results = []
    visited = set()
    for start in range(len(users)):
        if start in visited:
            continue
        visited.add(start)
        group = []
        pending = [start]
        while pending:
            current = pending.pop()
            group.append(current)
            for info in users[current]:
                # pop() so each value's holder list is expanded only once,
                # keeping the whole traversal linear in the input size.
                for other in holders.pop(info, ()):
                    if other not in visited:
                        visited.add(other)
                        pending.append(other)
        # Traversal order is arbitrary; report members in input order.
        group.sort()
        results.append(group)
    return results
- from random import randrange
- from timeit import timeit
MAXVALUE = 1000

# Benchmark on five independently generated random data sets so that one
# outlier data set cannot distort the timeit measurement.
for _trial in range(5):
    # 1000 records of three random values each, drawn from [0, MAXVALUE).
    user_info = [
        [randrange(MAXVALUE) for _field in range(3)]
        for _record in range(1000)
    ]
    print(
        timeit(
            lambda: find_duplicates(user_info),
            number=10000,
        )
    )
Add Comment
Please, Sign In to add comment