Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import uuid
# Synthetic input: random (keyword, URL) pairs used to exercise the grouping code.
colnames = ["keyword_id", "url_hash"]

# Identifier pools: 100 candidate keywords and 35 candidate URL hashes,
# each a random UUID rendered as a string.
keywords = list(map(str, (uuid.uuid4() for _ in range(100))))
url_hash = list(map(str, (uuid.uuid4() for _ in range(35))))

# 1000 rows; each column is sampled independently (with replacement) from its
# pool.  The keyword column is drawn first, then the URL column.
df = pd.DataFrame(
    dict(
        zip(
            colnames,
            (np.random.choice(pool, 1000) for pool in (keywords, url_hash)),
        )
    )
)
# Disjoint-set forest stored as a parent map: ``table[x]`` is the parent of
# ``x``, and a root is any entry that maps to itself.
table = {}


def find(val):
    """Return the root (set representative) of ``val`` in ``table``.

    While walking up, each visited node is re-pointed at its grandparent, which
    shortens the chain for later lookups.

    Args:
        val: A value already present as a key in ``table``.

    Returns:
        The root entry reached from ``val``.

    Raises:
        KeyError: If ``val`` (or a parent reached from it) is not in ``table``.
    """
    while val != table[val]:
        # Skip one level: point this node at its grandparent, then step up.
        table[val] = table[table[val]]
        val = table[val]
    return table[val]


def union(vals):
    """Merge the sets containing every member of ``vals`` into one set.

    Members not yet present in ``table`` are registered as singleton sets
    first.  The merged set is rooted at the smallest of the existing roots, so
    the chosen representative is deterministic.

    Links are made root-to-root.  Re-pointing the members themselves would
    overwrite links created by earlier unions and split sets that had already
    been merged.

    Args:
        vals: Iterable of mutually comparable, hashable values (a list, a
            NumPy array, ...).  An empty iterable is a no-op.
    """
    roots = set()
    for val in vals:
        table.setdefault(val, val)
        roots.add(find(val))
    # Avoid truth-testing ``vals`` itself: it may be a NumPy array.
    if not roots:
        return
    representative = min(roots)
    for root in roots:
        table[root] = representative
# Seed the parent map: every URL hash starts as its own root, and each keyword
# that occurs with that URL is attached directly beneath it.
for url_key, kw_series in df.groupby("url_hash")["keyword_id"]:
    table[url_key] = url_key
    table.update(dict.fromkeys(kw_series, url_key))

# For each keyword, merge all URL hashes that occur with it.
for _, url_series in df.groupby("keyword_id")["url_hash"]:
    union(url_series.values)

# Label every row with the representative that its keyword resolves to.
df["group"] = df["keyword_id"].map(find)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement