Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Step 1: Log in to the Hortonworks sandbox
- ssh root@127.0.0.1 -p 2222
- Password: maria_dev
- Step 2: Create two Hive tables, users_orc and users_tmp, with the same structure as the users table found in the test2 database on MySQL (use beeline or the Hive View in Ambari)
-- Merge target: bucketed, transactional ORC table (required for Hive ACID MERGE).
CREATE TABLE users_orc (
    id          BIGINT,
    email       STRING,
    name        STRING,
    create_date STRING,
    update_date STRING
)
CLUSTERED BY (id) INTO 5 BUCKETS
STORED AS ORC
TBLPROPERTIES ("transactional"="true");
-- Staging table populated from the Spark shell.
-- FIX: this table must NOT be transactional — Spark on HDP 2.6 cannot write
-- into Hive ACID (transactional) tables, so the later INSERT OVERWRITE from
-- the Spark script would fail against the original bucketed ACID definition.
-- Plain ORC is sufficient here; only users_orc (the MERGE target) needs ACID.
-- Columns and table name are unchanged, so the MERGE source query still works.
CREATE TABLE users_tmp (
    id          BIGINT,
    email       STRING,
    name        STRING,
    create_date STRING,
    update_date STRING
)
STORED AS ORC;
- Step 3: Start spark shell with mysql connector
- cd /usr/hdp/2.6/spark2/bin
- ./spark-shell --jars ./../jars/mysql-connector-java-5.1.44-bin.jar
- --Spark script:
// Copy the MySQL `users` table into the Hive staging table users_tmp.
import org.apache.spark.sql.hive.HiveContext;

// JDBC connection properties.
// SECURITY NOTE(review): credentials are hard-coded in the script; prefer
// passing them via --conf or a properties file kept out of version control.
val mysql_props = new java.util.Properties;
mysql_props.setProperty("user", "root");
mysql_props.setProperty("password", "Clair2016");

// 10.0.2.2 is the VirtualBox host address as seen from inside the sandbox VM.
val connection = "jdbc:mysql://10.0.2.2:3306/test2";

// One HiveContext serves both the JDBC read and the Hive writes
// (the original created a second, redundant HiveContext `hiveCtx`).
val sqlContext = new HiveContext(sc);

// FIX: `.format("orc")` before `.jdbc(...)` was a no-op — DataFrameReader.jdbc
// selects the JDBC source itself and ignores a previously set format.
val users = sqlContext.read.jdbc(connection, "users", mysql_props);
users.show;

// Persist the pulled rows as an ORC-backed Hive table.
users.write.format("orc").mode("overwrite").saveAsTable("users_table");

// Sanity check: peek at a few rows via the same HiveContext.
val topRows = sqlContext.sql("SELECT id, email, name FROM users_table LIMIT 10");
topRows.show;

// FIX: Hive does not accept a parenthesized subquery after the table name in
// INSERT OVERWRITE; use the plain `INSERT OVERWRITE TABLE t SELECT ...` form.
sqlContext.sql("INSERT OVERWRITE TABLE users_tmp SELECT * FROM users_table");
- Step 4: Do merge in Hive (use beeline or Hive View)
-- Merge the staged rows into the ACID target table.
-- FIX: MERGE needs a full ACID session — concurrency support and the
-- DbTxnManager — not just bucketing enforcement (Hive 2.2+ / HDP 2.6).
SET hive.support.concurrency = true;
SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
SET hive.enforce.bucketing = true;

MERGE INTO users_orc AS t
USING (SELECT * FROM users_tmp) AS sub
ON sub.id = t.id
-- Existing users: refresh mutable fields only (create_date is left as-is).
WHEN MATCHED THEN UPDATE SET
    name        = sub.name,
    email       = sub.email,
    update_date = sub.update_date
-- New users: insert the full row (values listed in target column order).
WHEN NOT MATCHED THEN INSERT VALUES
    (sub.id, sub.email, sub.name, sub.create_date, sub.update_date);
Add Comment
Please, Sign In to add comment