Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Example: interval join of two RDDs with ``theta_join``.
#
# S holds date intervals ('start'/'end'), T holds single 'timestamp' events.
# Both are re-keyed below and handed to ``theta_join`` together with a
# predicate over those keys.
#
# NOTE: ``sc``, ``theta_join`` and ``datetime`` are not defined in this
# snippet; they are expected to be in scope already.
#
# Integer literals are written without leading zeros: ``01`` is an octal
# literal in Python 2 and a syntax error in Python 3, while ``1`` means the
# same thing everywhere.
S = sc.parallelize([
    {'start': datetime(2000, 10, 11), 'end': datetime(2001, 1, 1)},
    {'start': datetime(2001, 1, 1), 'end': datetime(2002, 1, 1)},
    {'start': datetime(2002, 1, 1), 'end': datetime(2003, 1, 1)},
    {'start': datetime(2003, 1, 1), 'end': datetime(2004, 1, 1)},
    {'start': datetime(2004, 1, 1), 'end': datetime(2005, 1, 1)},
])
T = sc.parallelize([
    {'timestamp': datetime(2000, 6, 5)},
    {'timestamp': datetime(2002, 6, 5)},
    {'timestamp': datetime(2003, 6, 5)},
    {'timestamp': datetime(2002, 7, 5)},
    {'timestamp': datetime(2010, 7, 5)},
])

# Key every interval row by its (start, end) pair and every event row by its
# timestamp; the full row is carried along as the value.
S = S.map(lambda r: ((r['start'], r['end']), r))
T = T.map(lambda r: (r['timestamp'], r))

# Strict "timestamp falls inside the interval" test (both bounds exclusive).
# NOTE(review): the predicate indexes its *second* argument as the interval
# and uses the first as the scalar, even though the parameter names (s, t)
# and the call ``theta_join(S, T, ...)`` suggest S's (start, end) key would
# come first. The sample output below lists the T row before the S row, so
# theta_join may pass the keys in that order -- confirm against theta_join.
join_condition = lambda s, t: t[0] < s < t[1]
results = theta_join(S, T, join_condition).collect()
for r in results:
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(r[1])  # the original rows
"""
({u'timestamp': datetime.datetime(2002, 6, 5, 0, 0)}, {u'start': datetime.datetime(2002, 1, 1, 0, 0), u'end': datetime.datetime(2003, 1, 1, 0, 0)})
({u'timestamp': datetime.datetime(2003, 6, 5, 0, 0)}, {u'start': datetime.datetime(2003, 1, 1, 0, 0), u'end': datetime.datetime(2004, 1, 1, 0, 0)})
({u'timestamp': datetime.datetime(2002, 7, 5, 0, 0)}, {u'start': datetime.datetime(2002, 1, 1, 0, 0), u'end': datetime.datetime(2003, 1, 1, 0, 0)})
"""
- """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement