Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from pyspark.sql.functions import col, countDistinct, lit

# Build a histogram of basket sizes: for every "number of products per basket"
# value, how many baskets have that size and what fraction of all baskets
# that represents.
#
# df is a Spark DataFrame:
#   DataFrame[basket: string, product: string, customer: string, store: string]

# Total number of distinct baskets; used as the denominator of the fraction.
# (countDistinct must be imported explicitly -- it was missing originally.)
baskets_tally = df.groupBy().agg(countDistinct(col('basket'))).collect()[0][0]

# Number of rows per basket.
# NOTE(review): this counts rows, so it equals the number of products only if
# each (basket, product) pair appears at most once -- confirm against the data.
df = (
    df.groupBy(col('basket'))
    .count()
    .withColumnRenamed('count', 'tally_of_products_per_basket')
)

# Number of baskets for each basket size, ordered by basket size.
df = (
    df.groupBy('tally_of_products_per_basket')
    .count()
    .withColumnRenamed('count', 'tally_of_baskets_containing_products_tally')
    .orderBy(col('tally_of_products_per_basket').asc())
)

# DataFrames are immutable: withColumn returns a NEW DataFrame, so the result
# must be assigned back, otherwise the fraction column is silently lost.
df = df.withColumn(
    'fraction_of_baskets_containing_products_tally',
    col('tally_of_baskets_containing_products_tally') / lit(baskets_tally),
)

### To illustrate the calculated data:
# df.orderBy(col('tally_of_products_per_basket').asc()) \
#     .select('tally_of_products_per_basket',
#             'fraction_of_baskets_containing_products_tally') \
#     .head(10)
#
# returns the first ten rows of the histogram (the sample output was not
# included in this paste).
Add Comment
Please, Sign In to add comment