Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // ===========================================================================
- // Welcome!
- //
- // Log in to your Snowflake instance, create a new worksheet, then
- // copy and paste the contents of this file into your worksheet.
- //
- // Make sure you're using the ACCOUNTADMIN role before we get started
- // (see the context in the upper right corner of your worksheet)
- //
- // Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar).
- //
- // We're going to start by setting up a Talend Stitch account using the built in Snowflake connection.
- // This will give us everything we need to start securely ingesting data.
- // ===========================================================================
- // ===========================================================================
- // Next, let's manually setup PowerBI similarly to how Partner Connect set
- // Talend Stitch up for us before.
- // ===========================================================================
- // Create the DB, user, role, and warehouse that powerbi will use to securely connect
- CREATE DATABASE IF NOT EXISTS POWERBI;
- CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy
- CREATE ROLE IF NOT EXISTS POWERBI_ROLE;
- CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL;
- // Attach permissions
- GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE;
- GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE;
- ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role.
- // ===========================================================================
- // ===========================================================================
- // Now, setup ingestion from the Stitch side of things and wait for data to flow.
- //
- // Let's explore the raw data.
- // ===========================================================================
- // Initial look at data
- SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10;
- // let's make the table easier to use in queries
- SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA';
- SELECT * FROM TABLE($SPOTIFY) LIMIT 10;
- // we can also set context to do this
- USE DATABASE PC_STITCH_DB;
- USE SCHEMA AWS_S3_CSV_BUCKET;
- SELECT * FROM SPOTIFY_DATA LIMIT 10;
- // NOTE: I prefer the variable approach for worksheets
- // where you'll be in multiple databases. It prevents mistakes
- // when running different parts of your worksheet
- // let's find out what the top songs are
- SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10;
- // let's see what date ranges we're working with
- SELECT
- MIN(DATE) AS startDate,
- MAX(DATE) AS endDate
- FROM
- TABLE($SPOTIFY);
- // looks like we're essentially playing with top daily streams for all of 2017.
- // Let's see which artist had the most streams over the whole data set
- SELECT
- ARTIST, SUM(STREAMS) as streamCount
- FROM
- TABLE($SPOTIFY)
- GROUP BY
- ARTIST
- ORDER BY
- streamCount DESC
- LIMIT 10;
- // hmm, these numbers seem high. Maybe we need to explore those region codes
- SELECT DISTINCT REGION FROM TABLE($SPOTIFY);
- // maybe we should just use global? I think aggregating with regions AND global
- // is resulting in double counting. Lets find out using Drake's streams.
- SET globalDrakeSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global');
- SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global');
- SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams;
- // what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece)
- SET globalKendrickSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global');
- SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global');
- SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams;
- // it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like)
- // The easiest way to do that is with a view. Let's make one now
- CREATE VIEW IF NOT EXISTS
- PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL
- AS
- SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global';
- // Now grab the view name as a variable
- SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL';
- // Cool, now we're working with some cleaner data.
- // Let's revisit our top artists.
- SELECT
- ARTIST, SUM(STREAMS) as streamCount
- FROM
- TABLE($SPOTIFY_GLOBAL)
- GROUP BY
- ARTIST
- ORDER BY
- streamCount DESC
- LIMIT 10;
- // now top songs
- SELECT
- ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount
- FROM
- TABLE($SPOTIFY_GLOBAL)
- GROUP BY
- ARTIST, TRACK_NAME
- ORDER BY
- streamCount DESC
- LIMIT 10;
- // wow, Ed Sheeran was killing it in 2017. Good for him.
- //
- // I think we have enough context to maybe build a good BI table...
- // ===========================================================================
- // ===========================================================================
- // Now, let's send some clean, BI-ready data to our PowerBI DB.
- //
- // Our goal from here will be to transform the data in PC_STITCH_DB to a clean,
- // BI-ready form inside of the POWERBI database for analytics consumption
- // ===========================================================================
- // let's create our spotify schema in the powerbi db
- CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY;
- // I think it would be fun to see the top 5 songs every
- // month for US streaming.
- // start by creating a temp table with stream counts for each song by month.
- // We'll filter to only US data and we'll ignore the partial January 2018 data
- CREATE OR REPLACE TEMPORARY TABLE
- POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
- AS (
- SELECT
- ARTIST,
- TRACK_NAME,
- SUM(STREAMS) AS STREAM_COUNT,
- DATE_TRUNC(MONTH, DATE) AS MONTH
- FROM
- TABLE($SPOTIFY)
- WHERE
- REGION='us' AND YEAR(DATE) = 2017
- GROUP BY
- MONTH, ARTIST, TRACK_NAME
- );
- // now we'll get song ranks within each month using a nested
- // query with a window function
- CREATE OR REPLACE TABLE
- POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS
- AS (
- SELECT * FROM
- (
- SELECT
- *,
- RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK
- FROM
- POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
- )
- WHERE
- MONTHLY_RANK <= 5
- );
- // now that we're finished with all that messiness, let's grant read access
- // on our new schema and table to our powerbi user
- GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
- GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
- // Great job! You should be able to make some cool visuazliations in PBI now.
- // ===========================================================================
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement