Untitled | a guest | Mar 19th, 2019
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

// Schema for the GTFS calendar_dates file: service_id, date, exception_type
val schema = StructType(Seq(
    StructField("service_id", StringType, true),
    StructField("date", StringType, true),
    StructField("exception_type", IntegerType, true)))

// Read the raw CSV from HDFS and map each line onto a Row matching the schema
val cd = sc.textFile("hdfs://node1:9000/user/folakemi/stm/gtfs/staging/calendar_dates/calendar_dates_3.txt").map(c => {
    val fields = c.split(",")
    Row(fields(0), fields(1), fields(2).toInt)
})
cd.take(5).foreach(print)

// Build a DataFrame from the RDD[Row] and the explicit schema
val cdDF = spark.createDataFrame(cd, schema)
cdDF.printSchema()
cdDF.show()

/******************OUTPUT***********************/
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
schema: org.apache.spark.sql.types.StructType = StructType(StructField(service_id,StringType,true), StructField(date,StringType,true), StructField(exception_type,IntegerType,true))
cd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[55] at map at <console>:33

[18N-H50N000S-86-S,20181225,2][18N-H50N000S-86-S,20181226,2][18N-H50N000S-86-S,20190101,2][18N-H50N000S-86-S,20190102,2][18N-H54N000S-81-S,20181225,2]

root
 |-- service_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- exception_type: integer (nullable = true)

cdDF: org.apache.spark.sql.DataFrame = [service_id: string, date: string ... 1 more field]

+-----------------+--------+--------------+
|       service_id|    date|exception_type|
+-----------------+--------+--------------+
|18N-H50N000S-86-S|20181225|             2|
|18N-H50N000S-86-S|20181226|             2|
|18N-H50N000S-86-S|20190101|             2|
|18N-H50N000S-86-S|20190102|             2|
|18N-H54N000S-81-S|20181225|             2|
|18N-H54N000S-81-S|20181226|             2|
|18N-H54N000S-81-S|20190101|             2|
|18N-H54N000S-81-S|20190102|             2|
|18N-H55N000S-84-S|20181225|             2|
|18N-H55N000S-84-S|20181226|             2|
|18N-H55N000S-84-S|20190101|             2|
|18N-H55N000S-84-S|20190102|             2|
|18N-H56N000S-84-S|20181225|             2|
|18N-H56N000S-84-S|20181226|             2|
|18N-H56N000S-84-S|20190101|             2|
|18N-H56N000S-84-S|20190102|             2|
|18N-H57N000S-82-S|20181225|             2|
|18N-H57N000S-82-S|20181226|             2|
|18N-H57N000S-82-S|20190101|             2|
|18N-H57N000S-82-S|20190102|             2|
+-----------------+--------+--------------+
only showing top 20 rows
/************************************************/

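The manual split-and-Row mapping above can also be done with Spark's built-in CSV reader, which applies the same explicit schema while parsing. A minimal sketch, assuming Spark 2.x and the same HDFS path (the `cdDF2` name is illustrative):

```scala
// Alternative load: DataFrameReader parses the CSV against the schema directly,
// converting exception_type to integer without a manual .toInt
val cdDF2 = spark.read
  .schema(schema)
  .csv("hdfs://node1:9000/user/folakemi/stm/gtfs/staging/calendar_dates/calendar_dates_3.txt")
cdDF2.show(5)
```

This skips the RDD round-trip entirely, and malformed lines can be handled with reader options instead of failing inside the map.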
/***********************************Spark SQL******************************/
cdDF.createOrReplaceTempView("calendar_dates_3")
/**************************************************************************/

/**********************************Spark Chart*****************************/
%sql
select * from calendar_dates_3
/**************************************************************************/
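Outside a notebook `%sql` cell, the same query against the registered temp view can be run programmatically. A sketch, assuming the `spark` session and the view created above:

```scala
// Query the temp view via the SparkSession and display the result
val result = spark.sql("select * from calendar_dates_3")
result.show()
```

`spark.sql` returns an ordinary DataFrame, so the result can be filtered, joined, or written out like `cdDF` itself.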