Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def intersect_rdds(rdd1, rdd2):
    """Return the cogrouped records whose key is present in both RDDs.

    Both arguments are expected to be pair RDDs, i.e. RDDs of
    ``(key, value)`` records. The join condition is the key (the first
    element of each record), which makes the result comparable to a SQL
    inner join on that column.

    The returned RDD holds ``(key, (values_from_rdd1, values_from_rdd2))``
    records as produced by ``cogroup``; only keys that have at least one
    value on each side are kept.

    Example (``sc`` being an active SparkContext)::

        left = sc.parallelize(
            [((0, 0), 1), ((0, 1), 2), ((1, 0), 3), ((1, 1), 4)]
        )
        right = sc.parallelize([((0, 1), 3), ((1, 1), 0)])
        common = intersect_rdds(left, right)
        # keys kept: (0, 1) and (1, 1)
    """

    def _present_on_both_sides(entry):
        # entry is (key, (left_values, right_values)); an empty group on
        # either side means the key is missing from that RDD.
        grouped = entry[1]
        return bool(grouped[0]) and bool(grouped[1])

    # NOTE: the inputs are used as passed in. An earlier version overwrote
    # them with hard-coded sample data built from a global ``sc``, which
    # ignored the caller's RDDs entirely.
    return rdd1.cogroup(rdd2).filter(_present_on_both_sides)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement