Untitled

// ===========================================================================
// Welcome!
//
// Log in to your Snowflake instance, create a new worksheet, then
// copy and paste the contents of this file into your worksheet.
//
// Make sure you're using the ACCOUNTADMIN role before we get started
// (see the context in the upper right corner of your worksheet)
//
// Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar).
//
// We're going to start by setting up a Talend Stitch account using the built in Snowflake connection.
// This will give us everything we need to start securely ingesting data.
// ===========================================================================


// ===========================================================================
// Next, let's manually setup PowerBI similarly to how Partner Connect set
// Talend Stitch up for us before.
// ===========================================================================
// Create the DB, user, role, and warehouse that powerbi will use to securely connect
CREATE DATABASE IF NOT EXISTS POWERBI;
CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy
CREATE ROLE IF NOT EXISTS POWERBI_ROLE;
CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL;

// Attach permissions
GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE;
GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE;
ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role.
// ===========================================================================


// ===========================================================================
// Now, setup ingestion from the Stitch side of things and wait for data to flow.
//
// Let's explore the raw data.
// ===========================================================================
// Initial look at data
SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10;

// let's make the table easier to use in queries
SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA';
SELECT * FROM TABLE($SPOTIFY) LIMIT 10;

// we can also set context to do this
USE DATABASE PC_STITCH_DB;
USE SCHEMA AWS_S3_CSV_BUCKET;
SELECT * FROM SPOTIFY_DATA LIMIT 10;

// NOTE: I prefer the variable approach for worksheets
// where you'll be in multiple databases. It prevents mistakes
// when running different parts of your worksheet

// let's find out what the top songs are
SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10;

// let's see what date ranges we're working with
SELECT
    MIN(DATE) AS startDate,
    MAX(DATE) AS endDate
FROM
    TABLE($SPOTIFY);

// looks like we're essentially playing with top daily streams for all of 2017.
// Let's see which artist had the most streams over the whole data set
SELECT
    ARTIST, SUM(STREAMS) as streamCount
FROM
    TABLE($SPOTIFY)
GROUP BY
    ARTIST
ORDER BY
    streamCount DESC
LIMIT 10;

// hmm, these numbers seem high. Maybe we need to explore those region codes
SELECT DISTINCT REGION FROM TABLE($SPOTIFY);

// maybe we should just use global? I think aggregating with regions AND global
// is resulting in double counting. Lets find out using Drake's streams.
SET globalDrakeSteams =         (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global');
SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global');
SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams;

// what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece)
SET globalKendrickSteams =         (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global');
SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global');
SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams;

// it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like)
// The easiest way to do that is with a view. Let's make one now
CREATE VIEW IF NOT EXISTS
  PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL
AS
  SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global';

// Now grab the view name as a variable
SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL';

// Cool, now we're working with some cleaner data.
// Let's revisit our top artists.
SELECT
    ARTIST, SUM(STREAMS) as streamCount
FROM
    TABLE($SPOTIFY_GLOBAL)
GROUP BY
    ARTIST
ORDER BY
    streamCount DESC
LIMIT 10;

// now top songs
SELECT
    ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount
FROM
    TABLE($SPOTIFY_GLOBAL)
GROUP BY
    ARTIST, TRACK_NAME
ORDER BY
    streamCount DESC
LIMIT 10;

// wow, Ed Sheeran was killing it in 2017. Good for him.
//
// I think we have enough context to maybe build a good BI table...
// ===========================================================================


// ===========================================================================
// Now, let's send some clean, BI-ready data to our PowerBI DB.
//
// Our goal from here will be to transform the data in PC_STITCH_DB to a clean,
// BI-ready form inside of the POWERBI database for analytics consumption
// ===========================================================================
// let's create our spotify schema in the powerbi db
CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY;

// I think it would be fun to see the top 5 songs every
// month for US streaming.

// start by creating a temp table with stream counts for each song by month.
// We'll filter to only US data and we'll ignore the partial January 2018 data
CREATE OR REPLACE TEMPORARY TABLE
  POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
AS (
SELECT
  ARTIST,
  TRACK_NAME,
  SUM(STREAMS) AS STREAM_COUNT,
  DATE_TRUNC(MONTH, DATE) AS MONTH
FROM
  TABLE($SPOTIFY)
WHERE
  REGION='us' AND YEAR(DATE) = 2017
GROUP BY
  MONTH, ARTIST, TRACK_NAME
);

// now we'll get song ranks within each month using a nested
// query with a window function
CREATE OR REPLACE TABLE
    POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS
AS (
SELECT * FROM
(
    SELECT
      *,
      RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK
    FROM
      POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
)
WHERE
  MONTHLY_RANK <= 5
);

// now that we're finished with all that messiness, let's grant read access
// on our new schema and table to our powerbi user
GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;


// Great job! You should be able to make some cool visuazliations in PBI now.
// ===========================================================================