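Running several independent Spark jobs in parallel from a single SparkSession: tasks are fanned out onto a driver-side ExecutorService via CompletableFuture, and the driver blocks until every job has finished.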
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.length;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// sparkSession and log are assumed to exist in the enclosing scope.
Dataset<Row> fullData = sparkSession.read().json("some_path");

// Driver-side thread pool: each task submits an independent Spark job.
ExecutorService executor = Executors.newFixedThreadPool(10);
List<Runnable> tasks = new ArrayList<>();
for (int i = 1; i <= 50; i++) {
    final int x = i; // the lambda needs an effectively final copy of the loop variable
    tasks.add(() -> {
        Dataset<Row> subset_1 = fullData.filter(length(col("name")).equalTo(x));
        Dataset<Row> subset_2 = fullData.filter(length(col("name")).equalTo(x));
        Dataset<Row> result = subset_1.join(subset_2, ...); // join condition elided in the original paste
        log.info("Res size is " + result.count()); // count() is an action, forcing Spark to execute the join
    });
}

// Run all tasks on the pool and block until every job has completed.
CompletableFuture<?>[] futures = tasks.stream()
        .map(task -> CompletableFuture.runAsync(task, executor))
        .toArray(CompletableFuture[]::new);
CompletableFuture.allOf(futures).join();
executor.shutdown();
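Note: Spark's scheduler is thread-safe, so submitting actions from multiple driver threads like this is a supported way to run concurrent jobs. Since every task re-filters fullData, calling fullData.cache() before submitting the tasks should avoid re-reading the JSON source for each of the 50 jobs.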