Ranking in PySpark

  from pyspark.sql.functions import col, desc
  from pyspark.sql import Window
  import pyspark.sql.functions as psf

  # Keep only the rows whose flight_year is 2008.
  # NOTE: df_renamed_dest is not defined in this snippet; it is expected to
  # exist already and to carry the columns flight_year,
  # destination_airport_name and passengers.
  df08 = df_renamed_dest.filter(col("flight_year") == 2008)

  # Total inbound passengers per destination airport: one output row per
  # airport. GroupedData.sum names the aggregate column "sum(passengers)",
  # which is the name the window below orders by.
  ranked = df08.groupBy("destination_airport_name").sum("passengers")

  # Rank airports from most to fewest inbound passengers. dense_rank gives
  # tied airports the same rank and leaves no gaps after a tie.
  # The window has no partitionBy, so Spark evaluates it over all rows in a
  # single partition; here the input is already reduced to one row per
  # airport by the groupBy above.
  windowA = Window.orderBy(psf.desc("sum(passengers)"))
  dfr = ranked.withColumn("Rank", psf.dense_rank().over(windowA))
