Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- # some functions
- # =================================================
def id_emails(raw):
    """Group e-mail addresses by the id they were recorded under.

    Args:
        raw: Iterable of ``(id, email)`` pairs. The same id may occur in
            several pairs.

    Returns:
        A dict mapping each id to the set of e-mails seen for it. Ids are
        inserted in the order they first appear in ``raw``.
    """
    emails_by_id = {}
    for user_id, email in raw:
        # setdefault creates the set the first time an id is seen and returns
        # the existing one afterwards, so a single lookup is enough.
        emails_by_id.setdefault(user_id, set()).add(email)
    return emails_by_id
def add2dict(res, n, id, email):
    """Append the pair ``[id, email]`` to group ``n`` of ``res`` in place.

    Args:
        res: Dict mapping a group key to a list of ``[id, email]`` pairs.
            It is mutated by this call.
        n: Key of the group to extend; the group is created if missing.
        id: Identifier stored as the first element of the pair. The name
            shadows the builtin but is kept so existing keyword callers
            keep working.
        email: E-mail stored as the second element of the pair.

    Returns:
        None. The result is the mutation of ``res``.
    """
    res.setdefault(n, []).append([id, email])
def recur(n, id, email, _seen=None):
    """Collect into group ``n`` every ``[id, email]`` pair linked to a start pair.

    Starting from ``(id, email)``, every other id that also owns ``email`` is
    recorded, and the walk continues through each of that id's remaining
    e-mails. Results are appended to the module-level ``res[n]``; the
    ownership map is read from the module-level ``data`` (id -> set of
    e-mails).

    Args:
        n: Key of the group in ``res`` that receives the pairs.
        id: Id the walk starts from (name kept for caller compatibility even
            though it shadows the builtin).
        email: E-mail of ``id`` whose other owners are followed.
        _seen: Internal. Set of ``(id, email)`` pairs already expanded during
            the current top-level call. Callers should leave it unset.

    Returns:
        None. The result is the mutation of ``res``.
    """
    if _seen is None:
        _seen = set()
    # Without this guard two ids that share two or more e-mails bounce the
    # walk between each other forever (A -x-> B -y-> A -x-> ...).
    if (id, email) in _seen:
        return
    _seen.add((id, email))

    group = res.setdefault(n, [])
    if [id, email] not in group:
        group.append([id, email])

    for other_id, other_emails in data.items():
        if other_id == id or email not in other_emails:
            continue
        # A pair can only be reached twice when the links form a cycle, so
        # skipping repeats leaves acyclic results untouched.
        if [other_id, email] not in group:
            group.append([other_id, email])
        # Follow the other owner's remaining e-mails, never the one we
        # arrived through.
        for next_email in other_emails - {email}:
            recur(n, other_id, next_email, _seen)
def check(raw, item):
    """Report whether ``item``, once sorted, already appears among ``raw``'s values.

    Args:
        raw: Dict whose values are sorted lists to compare against.
        item: List that is sorted before the comparison.

    Returns:
        True if any value of ``raw`` equals ``sorted(item)``, otherwise False
        (including when ``raw`` is empty).
    """
    # Sort once instead of once per stored value.
    target = sorted(item)
    # Every stored value is inspected; returning from inside the loop on the
    # first mismatch would miss matches stored later.
    return any(stored == target for stored in raw.values())
def create_unique_data(res):
    """Drop groups whose sorted contents duplicate an earlier group.

    Args:
        res: Dict mapping a group key to a list of ``[id, email]`` pairs.
            It is not modified.

    Returns:
        A dict holding the sorted pair list of each distinct group. The key
        is the zero-based position, in ``res``'s iteration order, of the
        first group with that content, so keys may be non-contiguous.
    """
    unique = {}
    for position, pairs in enumerate(res.values()):
        ordered = sorted(pairs)
        # Compare against every group kept so far, not only the first one,
        # so a repeat of any earlier group is recognised.
        if ordered not in unique.values():
            unique[position] = ordered
    return unique
- # =================================================
# 0. user data: (id, email) pairs; ids that share an e-mail belong together
raw = [
    ['001', 'm001@com'],
    ['0404', 'other@com'],
    ['001', 'm100@com'],
    ['020', 'B020@com'],
    ['020', 'm001@com'],
    ['300', 'B020@com'],
]
data = id_emails(raw)
# data = {'001': {'m001@com', 'm100@com'},
#         '020': {'B020@com', 'm001@com'},
#         '300': {'B020@com'},
#         '0404': {'other@com'}}

# 1. processing: build one candidate group per id. All e-mails of an id feed
# the same group number, so ids that are linked to each other end up with
# identical groups that step 2 can collapse.
n, res = 0, {}
for key, emails in data.items():
    for email in emails:
        recur(n, key, email)
    n += 1

# 2. deleting duplicates
uniq_data = create_unique_data(res)

# # print result raw data:
# for k, v in res.items():
#     print(k, v)
# # print normal result:
# for k, v in uniq_data.items():
#     print(k, v)

# 3. converting: one row per [id, email] pair, indexed by
# (group, position inside the group)
df = pd.DataFrame.from_dict(
    {
        (group, position): pair
        for group, pairs in uniq_data.items()
        for position, pair in enumerate(pairs)
    },
    orient='index',
)
df = df.reset_index()
df.columns = ['multiindex', 'id', 'email']
df['group'] = df['multiindex'].apply(lambda x: x[0])
df['number_in_group'] = df['multiindex'].apply(lambda x: x[1])
# df = df[['group', 'id', 'email']]

# Bare expression: displays the frame when run as the last line of a
# notebook cell; it has no effect in a plain script.
df
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement