Advertisement
Guest User

Untitled

a guest
Mar 20th, 2019
1,366
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.05 KB | None | 0 0
  """Expand a map-typed DataFrame column into one column per key (PySpark demo).

  The script builds a three-row DataFrame whose ``app_usage`` column holds a
  dict per user, collects the union of every dict's keys, and produces a new
  DataFrame with ``user_id`` plus one column per key.  Keys a user does not
  have are filled with ``empty_value_fill``.
  """
  from pyspark.sql import Row
  from pyspark.sql import HiveContext
  from pyspark.sql.functions import udf
  from pyspark.context import SparkContext


  sc = SparkContext("local", "dict to col")
  # NOTE(review): HiveContext is kept exactly as the original used it; on
  # Spark 2.x and later SparkSession is presumably the preferred entry point --
  # confirm against the Spark version this runs on.
  hc = HiveContext(sc)

  data = hc.createDataFrame([
      Row(
          user_id=1,
          app_usage={'snapchat': 2, 'facebook': 10, 'gmail': 1},
          active_hours={4: 1, 6: 11, 22: 1},
      ),
      Row(
          user_id=2,
          app_usage={'tinder': 100, 'zoosk': 3, 'hotmail': 2},
          active_hours={6: 2, 18: 23, 23: 80},
      ),
      Row(
          user_id=3,
          app_usage={'netflix': 50, 'facebook': 5, 'amazon': 10},
          active_hours={10: 4, 19: 6, 20: 55},
      ),
  ])

  data.show()

  # Each element becomes a plain (app_usage, user_id) tuple, in the order the
  # columns are selected here.
  rdd = data.select('app_usage', 'user_id').rdd.map(tuple)
  rdd.foreach(print)

  # Union of the keys of every app_usage dict, sorted so the generated column
  # order is deterministic.
  cols = sorted(
      rdd.map(lambda pair: set(pair[0])).reduce(lambda acc, keys: acc | keys)
  )
  empty_value_fill = 0


  def _to_wide_row(pair):
      """Return ``[user_id, v1, v2, ...]`` with one value per name in ``cols``.

      ``pair`` is an ``(app_usage, user_id)`` tuple as produced by ``rdd``.
      Names missing from the ``app_usage`` dict yield ``empty_value_fill``.
      """
      usage, user_id = pair
      return [user_id] + [usage.get(col, empty_value_fill) for col in cols]


  new_cols_data = rdd.map(_to_wide_row).toDF(['user_id'] + cols)
  new_cols_data.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement