Not a member of Pastebin yet? Sign up — it unlocks many cool features!
-- -----------------------------------------------------------------
-- EXTERNAL HIVE TABLE ON PARQUET
-- All columns are declared STRING so malformed values in the sample
-- data (e.g. "NotADouble", "NotATimestamp") load without errors.
-- Being EXTERNAL, dropping the table leaves the files at /table6.
-- -----------------------------------------------------------------
CREATE EXTERNAL TABLE table6 (
  id       STRING,
  db1field STRING,
  created  STRING,
  empid    STRING
)
STORED AS PARQUET
LOCATION '/table6';
import spark.implicits._
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types._

// Sample records in table6 column order: id, db1field, created, empid.
// Deliberately includes nulls, empty strings and unparseable values to
// exercise how the STRING-typed table tolerates bad data.
val sampleRows = Seq(
  Row("first1updatedagain", "1.1", "2018-10-12", "12232"),
  Row("secondupdated1again", "2.2", "2018-10-12", "23244"),
  Row("third2updatedagain", "3.3", "2018-10-12", "54434"),
  Row("withNulls2updatedagain", null, null, null),
  Row("withEmptyStrs2updated", "", "", null),
  Row("withBadValues4updated", "NotADouble", "NotATimestamp", "wrong")
)

// Distribute the sample rows as an RDD[Row] for createDataFrame below.
val rowsRdd: RDD[Row] = sc.parallelize(sampleRows)
// Schema matching the Hive DDL for table6: four nullable STRING columns.
val schema = StructType(Seq(
  StructField("id", StringType, nullable = true),
  StructField("db1field", StringType, nullable = true),
  StructField("created", StringType, nullable = true),
  StructField("empid", StringType, nullable = true)
))
// Materialize the sample rows as a DataFrame and write them (Parquet)
// directly into the external table's location, replacing prior contents.
val df = spark.createDataFrame(rowsRdd, schema)
println("before writing")
df.show(truncate = false)
df.write.mode("overwrite").parquet("/table6")

// Read the files straight back to confirm the write landed.
val dfRead = spark.read.parquet("/table6")
println("reading the parquet ")
dfRead.show(truncate = false)

// Query through the Hive table. REFRESH invalidates Spark's cached file
// listing/metadata so the freshly overwritten files are picked up.
println("Using SQL query")
spark.sql("REFRESH TABLE table6")
spark.sql("select * from table6").show(truncate = false)
import org.apache.spark.sql.SparkSession

// NOTE: in the spark-shell a session already exists, so getOrCreate()
// returns it unchanged; enableHiveSupport() only takes effect when a
// brand-new session is being built.
val sparksession = SparkSession.builder().enableHiveSupport().getOrCreate()

// Renamed from `df`: re-declaring `val df` shadows the earlier binding and
// is only legal in the REPL — it would be a duplicate-definition error in
// a compiled Scala file.
val dfHive = sparksession.sql("select * from table6 ")
dfHive.show(false)

// Same query issued through the legacy SQLContext facade for comparison;
// it should produce identical output.
val df2 = sparksession.sqlContext.sql("select * from table6 ")
df2.show(false)
Add Comment
Please sign in to add a comment.