Advertisement
Guest User

Untitled

a guest
Oct 17th, 2019
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.35 KB | None | 0 0
  1. >>> from pyspark.sql import SparkSession, functions as F
  2. >>> from pyspark import SparkConf
  3. >>> conf = SparkConf()
  4.  
  5. >>> spark = SparkSession.builder \
  6. .config(conf=conf) \
  7. .appName('Dataframe with Indexes') \
  8. .getOrCreate()
  9.  
  10.  
  11. # create a simple dataframe with two columns
  12. >>> data = [{'column1': 1, 'column2': 2}, {'column1': 15, 'column2': 21}]
  13. >>> df = spark.createDataFrame(data)
  14. >>> df.show()
  15. +-------+-------+
  16. |column1|column2|
  17. +-------+-------+
  18. |      1|      2|
  19. |     15|     21|
  20. +-------+-------+
  21.  
  22. # use zipWithIndex to add the indexes and then toDF to get back to a dataframe
  23. >>> rdd_df = df.rdd.zipWithIndex()
  24. >>> df_final = rdd_df.toDF()
  25. >>> df_final.show()
  26. +--------+---+
  27. | _1| _2|
  28. +--------+---+
  29. | [1, 2]| 0|
  30. |[15, 21]| 1|
  31. +--------+---+
  32.  
  33. # Let's inspect the result datatypes:
  34. >>> df_final
  35. DataFrame[_1: struct<column1:bigint,column2:bigint>, _2: bigint]
  36.  
  37. # and then expand _1 column into the two we had before:
  38. >>> df_final = df_final.withColumn('column1', df_final['_1'].getItem("column1"))
  39. >>> df_final = df_final.withColumn('column2', df_final['_1'].getItem("column2"))
  40.  
  41. # finally rename the _2 index column and select the columns we need:
  42. >>> df_final.withColumnRenamed('_2', 'index').select('index', 'column1', 'column2').show()
  43. +-----+-------+-------+
  44. |index|column1|column2|
  45. +-----+-------+-------+
  46. |    0|      1|      2|
  47. |    1|     15|     21|
  48. +-----+-------+-------+
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement