Advertisement
Guest User

Untitled

a guest
Oct 8th, 2016
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.15 KB | None | 0 0
  1. from pyspark import SparkConf, SparkContext
  2. from pyspark.sql import SQLContext, Row
  3.  
  4. conf = SparkConf().setAppName("PlaySQL")
  5. sc = SparkContext(conf = conf)
  6. sqlContext = SQLContext(sc)
  7.  
  8. jdbcUrl = 'jdbc:mysql://ip-172-31-13-154.ec2.internal:3306/retail_db'
  9.  
  10. dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
  11. sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']
  12.  
  13. for i in range (0,5):
  14. dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='retail_dba', password='itversity',dbtable=sql_tables[i]).load()
  15. dataframes_list[i].registerTempTable(sql_tables[i])
  16.  
  17. agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")
  18.  
  19. for res in agg_results.collect():
  20. print(res)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement