# UDFs are applied element-wise, to the values in a column, not to the column as a whole;
# they take Columns as arguments (pyspark.sql.Column)
# and declare a return type from pyspark.sql.types
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import StringType

>>> def f(c1, c2):
...     return str(c1) + str(c2)
>>> fu = F.udf(f, StringType())
>>> df = spark.createDataFrame([(1, 'a'), (1, 'b'), (2, 'd')], ['c1', 'c2'])
>>> df.withColumn('test', fu(df.c1, df.c2)).show()
+---+---+----+
| c1| c2|test|
+---+---+----+
|  1|  a|  1a|
|  1|  b|  1b|
|  2|  d|  2d|
+---+---+----+


## MAP ##
# a map over rows is expressed with withColumn plus a UDF (see the UDF example above)
>>> @F.udf(returnType=T.StringType())
... def f(c1, c2):
...     return str(c1) + str(c2)
>>> df = spark.createDataFrame([(1, '123'), (1, '90'), (2, '45')], ['c1', 'c2'])
>>> df.withColumn('test', f(df.c1, df.c2)).show()
+---+---+----+
| c1| c2|test|
+---+---+----+
|  1|123|1123|
|  1| 90| 190|
|  2| 45| 245|
+---+---+----+


## FILTER ##

# note: in plain Python, `&` binds tighter than `>` and `==`, so the return
# expression below is evaluated as a chained comparison, not as two
# conditions joined by a logical AND
@F.udf(T.BooleanType())
def g(c1, c2):
    return int(c1) > 1 & int(c2) % 2 == 0
df = spark.createDataFrame([(1, '123'), (1, '90'), (2, '45')], ['c1', 'c2'])
df.filter(g(df.c1, df.c2)).show()
'''
+---+---+
| c1| c2|
+---+---+
|  1| 90|
+---+---+
'''
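# if the intent was "c1 greater than 1 AND c2 even", the test needs explicit
# boolean logic; a minimal sketch with a hypothetical g2 (note it would not
# keep the same rows as the run above):
@F.udf(T.BooleanType())
def g2(c1, c2):
    # both conditions must hold for the row to pass the filter
    return int(c1) > 1 and int(c2) % 2 == 0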

# CURRYING
# currying is the transformation of a function taking several arguments into
# a function taking a single argument that returns a function over the
# remaining arguments; a curried UDF factory is sketched below
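# a minimal sketch of currying applied to UDFs, assuming the same spark
# session as above: a one-argument factory returns a udf closed over that
# argument (make_suffixer and add_suffix are hypothetical names used for
# illustration)
def make_suffixer(suffix):
    @F.udf(returnType=T.StringType())
    def add_suffix(c):
        # the returned udf remembers `suffix` through its closure
        return str(c) + suffix
    return add_suffix

df = spark.createDataFrame([(1, 'a'), (1, 'b'), (2, 'd')], ['c1', 'c2'])
df.withColumn('test', make_suffixer('_x')(df.c2)).show()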