Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
// Location of the raw, tab-delimited employees extract (one record per line).
// Alternate path used when developing on macOS:
// val employeesPath = "/Users/itversity/Research/data/hr_db/employees/part-00000"
val employeesPath = "/mnt/c/data/hr_db/employees/part-00000"

/* Earlier DataFrame-only approach, kept for reference: split the raw `value`
 * column positionally into typed columns. Superseded by the Dataset `map`
 * based parsing below.
val employees = spark.
  read.
  text(employeesPath).
  withColumn("employee_id", split($"value", "\t")(0).cast("int")).
  withColumn("first_name", split($"value", "\t")(1)).
  withColumn("last_name", split($"value", "\t")(2)).
  withColumn("email", split($"value", "\t")(3)).
  withColumn("phone_number", split($"value", "\t")(4)).
  withColumn("hire_date", split($"value", "\t")(5)).
  withColumn("job_id", split($"value", "\t")(6)).
  withColumn("salary", split($"value", "\t")(7).cast("float")).
  withColumn("commission_pct", split($"value", "\t")(8).cast("int")).
  withColumn("manager_id", split($"value", "\t")(9).cast("int")).
  withColumn("department_id", split($"value", "\t")(10).cast("int")).
  drop($"value")
*/
// Read the raw file as a Dataset[String]: one tab-delimited record per line.
val employeesRaw = spark.
  read.
  text(employeesPath).
  as[String]

// Parse each record positionally into a typed tuple, then name the columns.
// NOTE: split with limit -1 keeps trailing empty fields. With the default
// limit, a record whose trailing columns (e.g. commission_pct / manager_id)
// are empty yields fewer than 11 tokens and r(10) throws
// ArrayIndexOutOfBoundsException.
val employees = employeesRaw.map(rec => {
  val r = rec.split("\t", -1)
  (r(0).toInt, r(1), r(2), r(3),
    r(4), r(5), r(6), r(7).toFloat,
    r(8), r(9), r(10)
  )
}).toDF("employee_id", "first_name", "last_name", "email",
  "phone_number", "hire_date", "job_id", "salary",
  "commission_pct", "manager_id", "department_id")
// Small dataset: keep the shuffle partition count low to avoid tiny tasks.
spark.conf.set("spark.sql.shuffle.partitions", "2")

import org.apache.spark.sql.expressions.Window

// Window covering all rows that share a department_id.
val spec = Window.partitionBy("department_id")

// Each employee's salary alongside the total salary expense of their
// department, ordered by department and then by salary (highest first).
val employeesSalary = employees
  .select("employee_id", "salary", "department_id")
  .withColumn("department_salary_expense", sum($"salary").over(spec))
  .orderBy($"department_id", $"salary".desc)
Add Comment
Please sign in to add a comment.