Guest User

Untitled

a guest
Jul 17th, 2018
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.88 KB | None | 0 0
  1. from pyspark.sql.functions import col, countDistinct, lit
  2. # df is a Spark DataFrame: DataFrame[basket: string, product: string, customer: string, store: string]
  3. baskets_tally = df.groupBy().agg(countDistinct(col('basket'))).collect()[0][0]
  4. df = df.groupBy(col('basket')).count().withColumnRenamed('count', 'tally_of_products_per_basket')
  5. df = df.groupBy("tally_of_products_per_basket") \
  6. .count() \
  7. .withColumnRenamed('count', 'tally_of_baskets_containing_products_tally') \
  8. .orderBy(col("tally_of_products_per_basket").asc())
  9. df.withColumn(
  10. 'fraction_of_baskets_containing_products_tally',
  11. col('tally_of_baskets_containing_products_tally') / lit(baskets_tally)
  12. )
  13. ### To illustrate the calculated data:
  14. # df.orderBy(col('tally_of_products_per_basket').asc())
  15. # .select('tally_of_products_per_basket', 'fraction_of_baskets_containing_products_tally').head(10)
  16. #
  17. # returns this histogram:
  18. #
  19. #
  20. #
Add Comment
Please, Sign In to add comment