Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package xxx.DataScience.CompensationStudy
- import org.apache.spark._
- import org.apache.log4j._
- import org.apache.spark.SparkContext
- import org.apache.spark.SparkContext._
- import org.apache.spark.SparkConf
- import org.apache.spark.sql.Row
- import org.apache.spark.sql.functions.{col, udf}
- import org.apache.spark.sql.types._
- import org.apache.spark.SparkContext
- import org.apache.spark.sql.SQLContext
object CompensationAnalysis {

  /**
   * Row model for one record of the compensation CSV (13 string columns,
   * in file order).
   */
  case class GetDF(profil_date: String, profil_pays: String, param_tarif2: String,
                   param_tarif3: String, dt_titre: String, dt_langues: String,
                   dt_diplomes: String, dt_experience: String, dt_formation: String,
                   dt_outils: String, comp_applications: String,
                   comp_interventions: String, comp_competence: String)

  /**
   * Entry point: loads the compensation CSV, parses each line into a
   * [[GetDF]], builds a DataFrame and prints its schema.
   *
   * Fixes over the previous revision:
   *  - rows were mapped with `x("profil_date")` on a tuple (no such method);
   *    parsing now builds `GetDF` directly inside the `flatMap`;
   *  - the two match branches produced tuples of different arity (13 vs 2),
   *    widening the RDD element type — malformed rows are now simply skipped;
   *  - `df.printSchema` referenced an undefined `df` (now `sum_df`);
   *  - `collect()` no longer pulls the whole dataset to the driver before
   *    creating the DataFrame;
   *  - the SparkContext is stopped on exit.
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("CompensationAnalysis ")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    // Parse each CSV line; rows with fewer than 13 fields are dropped.
    // NOTE(review): a naive split(",") does not handle quoted fields that
    // contain commas — prefer Spark's CSV reader for production use.
    val records = sc.textFile("C:/Users/../Downloads/CompensationStudy.csv").flatMap { line =>
      val f = line.split(",", -1) // limit -1 keeps trailing empty columns
      if (f.length >= 13)
        Some(GetDF(f(0), f(1), f(2), f(3), f(4), f(5), f(6), f(7), f(8), f(9),
                   f(10), f(11), f(12)))
      else
        None
    }

    // RDD[GetDF] -> DataFrame via sqlContext.implicits._
    val sum_df = records.toDF()
    sum_df.printSchema()

    sc.stop()
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement