val partitions = 5 // depends on data size and volume; will be different in every case
val df = spark.read.json("URI://path/to/parquet/files/")
df.createOrReplaceTempView("df")
val df_output = spark
  .sql("SELECT DISTINCT * FROM df") // removes duplicate rows; drop this line if deduplication is not needed
  .coalesce(partitions)
df_output.write.parquet("URI://path/to/destination")
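Rather than hardcoding `partitions = 5`, the count can be derived from the total input size so each output Parquet file lands near a target size. A minimal sketch, assuming a 128 MB per-file target; the helper name `estimatePartitions` and the target size are illustrative, not from the original:

```scala
object PartitionEstimator {
  // Estimate a partition count from total input bytes, aiming for roughly
  // one output file per targetBytesPerFile (128 MB is a common Parquet target).
  def estimatePartitions(totalBytes: Long,
                         targetBytesPerFile: Long = 128L * 1024 * 1024): Int =
    math.max(1, math.ceil(totalBytes.toDouble / targetBytesPerFile).toInt)
}
```

For example, roughly 640 MB of input yields 5 partitions, matching the hardcoded value above; the result is then passed to `coalesce`.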