Advertisement
Guest User

Untitled

a guest
Jul 27th, 2022
600
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.85 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import uuid
  4.  
  5. # generate dummy data
  6. colnames = ["keyword_id", "url_hash"]
  7. keywords = [str(uuid.uuid4()) for _ in range(100)]
  8. url_hash = [str(uuid.uuid4()) for _ in range(35)]
  9.  
  10. # keyword_id, url_hash
  11. df = pd.DataFrame({
  12.     "keyword_id": np.random.choice(keywords, 1000),
  13.     "url_hash": np.random.choice(url_hash, 1000)
  14. })
  15.  
  16.  
  17. table = {}
  18. def find(val):
  19.     while val != table[val]:
  20.         table[val] = table[table[val]]
  21.         val = table[val]
  22.     return table[val]
  23.  
  24. def union(vals):
  25.     for val in vals:
  26.         table[val] = min(vals)
  27.  
  28.  
  29. for url, key_ids in df.groupby("url_hash")["keyword_id"]:
  30.     table[url] = url
  31.     for k_id in key_ids:
  32.         table[k_id] = url
  33.  
  34. for key_id, urls in df.groupby("keyword_id")["url_hash"]:
  35.     union(urls.values)
  36.  
  37.  
  38. df["group"] = df["keyword_id"].apply(find)
  39.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement