import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.length;

import com.google.common.collect.Lists;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

Dataset<Row> fullData = sparkSession.read().json("some_path");

// Thread pool that lets up to 10 Spark jobs run concurrently.
ExecutorService executor = Executors.newFixedThreadPool(10);
List<Runnable> tasks = Lists.newArrayList();

for (int i = 1; i <= 50; i++) {
    final int x = i; // effectively-final copy for use inside the lambda
    tasks.add(() -> {
        Dataset<Row> subset_1 = fullData.filter(length(col("name")).equalTo(x));
        Dataset<Row> subset_2 = fullData.filter(length(col("name")).equalTo(x));
        Dataset<Row> result = subset_1.join(subset_2, ...); // join condition elided in the original
        log.info("Res size is " + result.count()); // count() is an action, forcing Spark to execute the join
    });
}

// Submit every task to the pool and block until all jobs have finished.
CompletableFuture<?>[] futures = tasks.stream()
    .map(task -> CompletableFuture.runAsync(task, executor))
    .toArray(CompletableFuture[]::new);
CompletableFuture.allOf(futures).join();
executor.shutdown();