daily pastebin goal
46%
SHARE
TWEET

Untitled

a guest Mar 20th, 2019 240 Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
  1. from pyspark.sql import Row
  2. from pyspark.sql import HiveContext
  3. from pyspark.sql.functions import udf
  4. from pyspark.context import SparkContext
  5.  
  6.  
  7. sc = SparkContext("local", "dict to col")
  8. hc = HiveContext(sc)
  9.  
  10. data = hc.createDataFrame([Row(user_id=1, app_usage={'snapchat': 2, 'facebook': 10, 'gmail': 1}, active_hours={4: 1, 6: 11, 22: 1}),
  11.  
  12.                            Row(user_id=2, app_usage={
  13.                                'tinder': 100, 'zoosk': 3, 'hotmail': 2}, active_hours={6: 2, 18: 23, 23: 80}),
  14.  
  15.                            Row(user_id=3, app_usage={'netflix': 50, 'facebook': 5, 'amazon': 10}, active_hours={10: 4, 19: 6, 20: 55})])
  16.  
  17. data.show()
  18. rdd = data.select('app_usage', 'user_id').rdd.map(tuple)
  19. rdd.foreach(print)
  20. cols = sorted(list(rdd.map(lambda x: set(x[0].keys())).reduce(
  21.     lambda acc, keys: acc | keys)))
  22. empty_value_fill = 0
  23. new_cols_data = rdd.map(lambda x: [x[1]] + list(map(lambda col: x[0][col]
  24.                                                     if col in x[0] else empty_value_fill, cols))).toDF(['user_id'] + cols)
  25. new_cols_data.show()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top