Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Expand a map-typed DataFrame column ("app_usage") into one column per
# distinct key, filling keys a user does not have with a default value.
from pyspark.sql import Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import udf
from pyspark.context import SparkContext

# Local Spark context named "dict to col"; the HiveContext wraps it and is
# used below to build the DataFrame.
sc = SparkContext("local", "dict to col")
hc = HiveContext(sc)

# Sample data: each row has a user id and two dict-valued fields.
data = hc.createDataFrame([
    Row(user_id=1,
        app_usage={'snapchat': 2, 'facebook': 10, 'gmail': 1},
        active_hours={4: 1, 6: 11, 22: 1}),
    Row(user_id=2,
        app_usage={'tinder': 100, 'zoosk': 3, 'hotmail': 2},
        active_hours={6: 2, 18: 23, 23: 80}),
    Row(user_id=3,
        app_usage={'netflix': 50, 'facebook': 5, 'amazon': 10},
        active_hours={10: 4, 19: 6, 20: 55}),
])
data.show()

# Keep only the two fields of interest; after map(tuple) every element is an
# (app_usage, user_id) pair, in the order given to select().
rdd = data.select('app_usage', 'user_id').rdd.map(tuple)
rdd.foreach(print)

# Union of the key sets of every app_usage dict, sorted so the resulting
# column order is deterministic.
cols = sorted(
    rdd.map(lambda x: set(x[0].keys())).reduce(lambda acc, keys: acc | keys)
)

# Value written into a user's column when that user's dict lacks the key.
empty_value_fill = 0

# One output row per user: [user_id, value for cols[0], value for cols[1], ...].
new_cols_data = rdd.map(
    lambda x: [x[1]] + [x[0].get(col, empty_value_fill) for col in cols]
).toDF(['user_id'] + cols)
new_cols_data.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement