Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from pyspark import SparkConf, SparkContext
- from pyspark.sql import SQLContext, Row
- #Setting the application name and creating the spark context and sql context objects.
- conf = SparkConf().setAppName("SparkSQL")
- sc = SparkContext(conf = conf)
- sqlContext = SQLContext(sc)
- #The URL where the mysql database is hosted
- jdbcUrl = 'jdbcUrl'
- #The list of all the dataframes which will be created
- dataframes_list = ['df_departments', 'df_categories', 'df_products', 'df_orders', 'df_order_items']
- #The list of all the tables from which the data has to be loaded from
- sql_tables = ['departments', 'categories', 'products', 'orders', 'order_items']
- for i in range (0,5):
- dataframes_list[i] = sqlContext.read.format('jdbc').options(url=jdbcUrl, user='user', password='password',dbtable=sql_tables[i]).load()
- dataframes_list[i].registerTempTable(sql_tables[i])
- #Executing the query which returns the revenue generated by each department in a month
- agg_results = sqlContext.sql("SELECT substr(o.order_date, 1,7) order_month, d.department_name, sum(oi.order_item_subtotal) FROM departments d join categories c ON d.department_id = c.category_department_id JOIN products p ON c.category_id = p.product_category_id JOIN order_items oi ON p.product_id = oi.order_item_product_id JOIN orders o ON o.order_id = oi.order_item_order_id GROUP BY substr(o.order_date, 1,7), d.department_name ORDER BY order_month, department_name")
- for res in agg_results.collect():
- print(res)
- #Sample results:
- '''
- Row(order_month=u'2013-07', department_name=u'Golf', _c2=101087.26000000005)
- Row(order_month=u'2013-07', department_name=u'Outdoors', _c2=20877.519999999993)
- Row(order_month=u'2013-07', department_name=u'Testing Merge', _c2=5622.149999999998)
- Row(order_month=u'2013-07', department_name=u'data', _c2=163108.2700000002)
- Row(order_month=u'2013-07', department_name=u'fanshoppy', _c2=389162.09)
- Row(order_month=u'2013-07', department_name=u'footwear', _c2=84624.92000000001)
- Row(order_month=u'2013-08', department_name=u'Golf', _c2=391535.7299999997)
- Row(order_month=u'2013-08', department_name=u'Outdoors', _c2=82849.71999999997)
- Row(order_month=u'2013-08', department_name=u'Testing Merge', _c2=23719.610000000008)
- Row(order_month=u'2013-08', department_name=u'data', _c2=598981.2099999998)
- Row(order_month=u'2013-08', department_name=u'fanshoppy', _c2=1410128.6399999997)
- Row(order_month=u'2013-08', department_name=u'footwear', _c2=321443.78999999986)
- Row(order_month=u'2013-09', department_name=u'Golf', _c2=388137.44)
- Row(order_month=u'2013-09', department_name=u'Outdoors', _c2=84080.7)
- Row(order_month=u'2013-09', department_name=u'Testing Merge', _c2=24377.450000000004)
- Row(order_month=u'2013-09', department_name=u'data', _c2=615739.5500000002)
- Row(order_month=u'2013-09', department_name=u'fanshoppy', _c2=1452927.6799999995)
- Row(order_month=u'2013-09', department_name=u'footwear', _c2=369264.4499999999)
- #The execution of the query using different engines has taken the following times
- 1. Executing the query using mySQL engine : 0.49 seconds
- 2. Executing the query on hive using Map Reduce engine which exececutes the query by running Map Reduce jobs in background: 52.085 seconds
- 3. Executing the query on hive using Tez as execution engine in background: 25.63 seconds
- 4. Executing the query using SparkSQL by loading the tables as Data Frames, it uses Apache Spark catalyst optimizer for running the query: 4.2 seconds
- '''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement