Advertisement
Guest User

Untitled

a guest
Jan 16th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 0.92 KB | None | 0 0
  import spark.implicits._
  import org.apache.spark.sql.Dataset

  /** Left-hand record of the join; `k` is the join key. */
  final case class Foo(k: String)

  /** Right-hand record of the join; paired with a [[Foo]] through `k`. */
  final case class Bar(k: String, b: Boolean)

  val dsFoo: Dataset[Foo] = spark.createDataset(List(Foo("a"), Foo("x")))

  // Deliberately holds no Bar with k == "x", so Foo("x") has no join partner.
  // Variant in which every Foo has at least one match:
  //   spark.createDataset(List(Bar("a", true), Bar("a", false), Bar("x", true)))
  val dsBar: Dataset[Bar] = spark.createDataset(List(Bar("a", true), Bar("a", false)))

  // Left outer join on k. The right-hand side is wrapped in Option so that a
  // null Bar (no match) is turned into None.
  val dsFooBar = dsFoo
    .joinWith(dsBar, dsFoo("k") === dsBar("k"), "leftOuter")
    .map { case (foo, bar) => (foo, Option(bar)) }

  // Group on the Foo so that each Foo can be paired with all of its Bars.
  // Original author's note: `.mapValues(_._2)` cannot be used here to project
  // out the Option, because Spark rejects a top-level Option.
  val dsGrouped = dsFooBar.groupByKey(pair => pair._1)

  // Emit one (Foo, List[Bar]) per group, keeping only the Bars that are present.
  // NOTE(review): the original author reports that this action fails at runtime
  // and suspects the unmatched Foo("x") as the trigger; the cause is not
  // confirmed here and this rewrite does not change the behavior.
  dsGrouped
    .mapGroups { (foo: Foo, bars: Iterator[(Foo, Option[Bar])]) =>
      foo -> bars.collect { case (_, Some(bar)) => bar }.toList
    }
    .show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement