Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import spark.implicits._
- import org.apache.spark.sql.Dataset
- final case class Foo(k:String)
- final case class Bar(k:String, b:Boolean)
- val dsFoo:Dataset[Foo] = spark.createDataset(List(Foo("a"), Foo("x")))
- //val dsBar:Dataset[Bar] = spark.createDataset(List(Bar("a", true), Bar("a", false), Bar("x", true)))
- val dsBar:Dataset[Bar] = spark.createDataset(List(Bar("a", true), Bar("a", false))) //note no element to link to Foo("x")
- val dsFooBar = dsFoo.joinWith(dsBar, dsFoo("k") === dsBar("k"), "leftOuter").map(tpl => (tpl._1, Option(tpl._2)))
- val dsGrouped = dsFooBar
- .groupByKey(_._1) // we want Foos with all their Bars
- //.mapValues(_._2) can't project out the Option here because spark doesn't like the top level option
- dsGrouped.mapGroups( (foo:Foo, bars:Iterator[(Foo,Option[Bar])]) => foo -> bars.toList.map(_._2).collect{ case Some(x) => x}).show //Baaaam explodes because (I think) Foo("x") has no Bar
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement