Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Optional: log in to MySQL interactively to inspect the source tables
# (prompts for password).
mysql -u retail_dba -p

# Import the `orders` table from MySQL into HDFS as Snappy-compressed Avro.
# Codec argument left unquoted for consistency with the order_items import.
sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --table orders \
  --target-dir /user/cloudera/teja_arun/ep01/orders \
  --compress \
  --compression-codec org.apache.hadoop.io.compress.SnappyCodec \
  --as-avrodatafile
# Import the `order_items` table from MySQL into HDFS as Snappy-compressed Avro,
# mirroring the orders import above.
sqoop-import --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --table order_items \
  --target-dir /user/cloudera/teja_arun/ep01/order_items \
  --compress \
  --compression-codec org.apache.hadoop.io.compress.SnappyCodec \
  --as-avrodatafile
# Set up a Spark/Hive context (Spark 1.x style: HiveContext over SparkContext)
# and load the two Avro datasets imported by sqoop, exposing them as
# temporary tables for Spark SQL.
from pyspark import Row, HiveContext, SparkContext, SparkConf

conf = SparkConf().setAppName("ep01").setMaster("yarn-client")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

# Read the Avro files via the databricks-avro data source.
ordersDF = sqlContext.read.load(
    "/user/cloudera/teja_arun/ep01/orders", "com.databricks.spark.avro")
orderItemsDF = sqlContext.read.load(
    "/user/cloudera/teja_arun/ep01/order_items", "com.databricks.spark.avro")

# Register as temp tables so they can be queried with sqlContext.sql(...).
ordersDF.registerTempTable("orders")
orderItemsDF.registerTempTable("order_items")
- Output columns: Order_Date, Order_Status, total_orders, total_amount. In plain English: find the total number of orders and the total amount per order status per day. Sort the result by order date descending, order status ascending, total amount descending, and total orders ascending. Dates must be formatted as YYYY-MM-DD.
- sqlResult=sqlContext.sql("select to_date(from_unixtime(cast(order_date/1000 as bigint))) order_formatted_date,order_status,count(order_id) total_count,cast(sum(order_item_subtotal) as decimal(10,2)) order_sum from orders o join order_items oi on o.order_id=oi.order_item_order_id group by to_date(from_unixtime(cast(order_date/1000 as bigint))),o.order_status order by order_formatted_date desc, order_status")
# Write the result twice, once gzip- and once snappy-compressed Parquet.
# The compression codec is a session-level SQL conf, so it must be set
# before each write.
sqlContext.setConf("spark.sql.parquet.compression.codec", "gzip")
sqlResult.write.parquet("/user/cloudera/teja_arun/ep01/solutions/parquet-gzip")

sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
# Fixed: original path was missing the ep01 segment, inconsistent with
# every other output path in this exercise.
sqlResult.write.parquet("/user/cloudera/teja_arun/ep01/solutions/parquet-snappy")
- sqlResult.map(lambda rec: str(rec[0])+","+str(rec[1])+","+str(rec[2])+","str(rec[3])).saveAsTextFile("/user/cloudera/teja_arun/ep01/solutions/csv")
- create result_one (order_date date,order_status varchar(255),total_count int,order_sum decimal float) constraint pk_date_status primary key (order_date,order_status)
# Export the CSV result from HDFS back into the MySQL result_one table.
# Fixed: the original was missing the line-continuation "\" after
# --export-dir, which silently dropped the --columns option from the command.
sqoop-export \
  --table result_one \
  --connect jdbc:mysql://quickstart.cloudera:3306/retail_db \
  --username retail_dba \
  --password cloudera \
  --export-dir "/user/cloudera/teja_arun/ep01/solutions/csv" \
  --columns "order_date,order_status,total_count,order_sum"
Add Comment
Please, Sign In to add comment