Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Step 1: Log in to the Hortonworks sandbox
- ssh root@127.0.0.1 -p 2222
- Password: maria_dev
- Step 2: Create two Hive tables, users_orc and users_tmp, with the same structure as the users table found in the test2 database on MySQL (use beeline or the Hive View in Ambari)
-- Merge target: bucketed, transactional ORC table (required for Hive ACID MERGE).
CREATE TABLE users_orc (
    id          BIGINT,
    email       STRING,
    name        STRING,
    create_date STRING,
    update_date STRING
)
CLUSTERED BY (id) INTO 5 BUCKETS
STORED AS ORC
TBLPROPERTIES ("transactional"="true");
-- Staging table populated from the Spark shell.
-- FIX: this table must NOT be transactional — Spark on HDP 2.6 cannot write
-- into Hive ACID (transactional) tables, so the later INSERT OVERWRITE from
-- the Spark script would fail against the original bucketed ACID definition.
-- Plain ORC is sufficient here; only users_orc (the MERGE target) needs ACID.
-- Columns and table name are unchanged, so the MERGE source query still works.
CREATE TABLE users_tmp (
    id          BIGINT,
    email       STRING,
    name        STRING,
    create_date STRING,
    update_date STRING
)
STORED AS ORC;
- Step 3: Start spark shell with mysql connector
- cd /usr/hdp/2.6/spark2/bin
- ./spark-shell --jars ./../jars/mysql-connector-java-5.1.44-bin.jar
- --Spark script:
// Copy the MySQL `users` table into the Hive staging table users_tmp.
import org.apache.spark.sql.hive.HiveContext;

// JDBC connection properties.
// SECURITY NOTE(review): credentials are hard-coded in the script; prefer
// passing them via --conf or a properties file kept out of version control.
val mysql_props = new java.util.Properties;
mysql_props.setProperty("user", "root");
mysql_props.setProperty("password", "Clair2016");

// 10.0.2.2 is the VirtualBox host address as seen from inside the sandbox VM.
val connection = "jdbc:mysql://10.0.2.2:3306/test2";

// One HiveContext serves both the JDBC read and the Hive writes
// (the original created a second, redundant HiveContext `hiveCtx`).
val sqlContext = new HiveContext(sc);

// FIX: `.format("orc")` before `.jdbc(...)` was a no-op — DataFrameReader.jdbc
// selects the JDBC source itself and ignores a previously set format.
val users = sqlContext.read.jdbc(connection, "users", mysql_props);
users.show;

// Persist the pulled rows as an ORC-backed Hive table.
users.write.format("orc").mode("overwrite").saveAsTable("users_table");

// Sanity check: peek at a few rows via the same HiveContext.
val topRows = sqlContext.sql("SELECT id, email, name FROM users_table LIMIT 10");
topRows.show;

// FIX: Hive does not accept a parenthesized subquery after the table name in
// INSERT OVERWRITE; use the plain `INSERT OVERWRITE TABLE t SELECT ...` form.
sqlContext.sql("INSERT OVERWRITE TABLE users_tmp SELECT * FROM users_table");
- Step 4: Do merge in Hive (use beeline or Hive View)
-- Merge the staged rows into the ACID target table.
-- FIX: MERGE needs a full ACID session — concurrency support and the
-- DbTxnManager — not just bucketing enforcement (Hive 2.2+ / HDP 2.6).
SET hive.support.concurrency = true;
SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
SET hive.enforce.bucketing = true;

MERGE INTO users_orc AS t
USING (SELECT * FROM users_tmp) AS sub
ON sub.id = t.id
-- Existing users: refresh mutable fields only (create_date is left as-is).
WHEN MATCHED THEN UPDATE SET
    name        = sub.name,
    email       = sub.email,
    update_date = sub.update_date
-- New users: insert the full row (values listed in target column order).
WHEN NOT MATCHED THEN INSERT VALUES
    (sub.id, sub.email, sub.name, sub.create_date, sub.update_date);
Add Comment
Please, Sign In to add comment