Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
// Location of the raw, tab-delimited employees extract (one record per line).
// Alternate path used when developing on macOS:
// val employeesPath = "/Users/itversity/Research/data/hr_db/employees/part-00000"
val employeesPath = "/mnt/c/data/hr_db/employees/part-00000"

/* Earlier DataFrame-only approach, kept for reference: split the raw `value`
 * column positionally into typed columns. Superseded by the Dataset `map`
 * based parsing below.
val employees = spark.
  read.
  text(employeesPath).
  withColumn("employee_id", split($"value", "\t")(0).cast("int")).
  withColumn("first_name", split($"value", "\t")(1)).
  withColumn("last_name", split($"value", "\t")(2)).
  withColumn("email", split($"value", "\t")(3)).
  withColumn("phone_number", split($"value", "\t")(4)).
  withColumn("hire_date", split($"value", "\t")(5)).
  withColumn("job_id", split($"value", "\t")(6)).
  withColumn("salary", split($"value", "\t")(7).cast("float")).
  withColumn("commission_pct", split($"value", "\t")(8).cast("int")).
  withColumn("manager_id", split($"value", "\t")(9).cast("int")).
  withColumn("department_id", split($"value", "\t")(10).cast("int")).
  drop($"value")
*/
// Read the raw file as a Dataset[String]: one tab-delimited record per line.
val employeesRaw = spark.
  read.
  text(employeesPath).
  as[String]

// Parse each record positionally into a typed tuple, then name the columns.
// NOTE: split with limit -1 keeps trailing empty fields. With the default
// limit, a record whose trailing columns (e.g. commission_pct / manager_id)
// are empty yields fewer than 11 tokens and r(10) throws
// ArrayIndexOutOfBoundsException.
val employees = employeesRaw.map(rec => {
  val r = rec.split("\t", -1)
  (r(0).toInt, r(1), r(2), r(3),
    r(4), r(5), r(6), r(7).toFloat,
    r(8), r(9), r(10)
  )
}).toDF("employee_id", "first_name", "last_name", "email",
  "phone_number", "hire_date", "job_id", "salary",
  "commission_pct", "manager_id", "department_id")
// Small dataset: keep the shuffle partition count low to avoid tiny tasks.
spark.conf.set("spark.sql.shuffle.partitions", "2")

import org.apache.spark.sql.expressions.Window

// Window covering all rows that share a department_id.
val spec = Window.partitionBy("department_id")

// Each employee's salary alongside the total salary expense of their
// department, ordered by department and then by salary (highest first).
val employeesSalary = employees
  .select("employee_id", "salary", "department_id")
  .withColumn("department_salary_expense", sum($"salary").over(spec))
  .orderBy($"department_id", $"salary".desc)
Add Comment
Please sign in to add a comment.