Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ####################################################
- #### Anomaly Finder ####
- #### by: Stefanie Molin ####
- #### ####
- ## Find anomalies in time series for accounts ##
- ## across several dimensions for diagnosis. ##
- ## Also check a specific campaign if provided. ##
- ####################################################
- #' Check Account (and Campaign) for Anomalies
- #'
- #' Find anomalies in time series for accounts across several dimensions for diagnosis. Also check a specific campaign if provided.
- #' @family Analytics & Queries
- #' @import AnomalyDetection
- #' @param username Your database username as a character string.
- #' @param client_identifier The client identifier of the client you want to run analysis for as an integer.
- #' @param campaign.id The campaign ID of the campaign you want to run analysis for as an integer. If NULL, only runs account level
- #' analysis.
- #' @param start The date you want to start your analysis formated as "YYYY-MM-DD".
- #' @param end The date you want to end your analysis formated as "YYYY-MM-DD".
- #' @param direction Either "both" for positive and negative anomalies or "pos" ("neg") for positive (negative) anomalies only.
- #' @param max.anomaly.percent The maximum percent (written as a decimal) of your data that can be considered an anomaly (must be
- #' less than 0.49).
- #' @param client.name The name of the client as a string to be printed to the graphs.
- #' @param campaign.name The name of the campaign as a string to be printed to campaign level graphs (if applicable).
- #' @param uses.dedup Boolean indicating whether your client provides you with deduplicated sales.
- #' @param password Character string for password. If left as NULL, RStudio will prompt for it.
- #'
- #' @return A list with results of anomaly detection analysis on total events, sales events, dedup ratio, client-level conversions,
- #' client-level clicks, and if applicable, campaign-level conversions and campaign-level clicks. Each includes a list of points
- #' that are anomalies and a graph for easy viewing.
- #'
- #' @note You need to provide at least 25 days of data for this analysis which uses Rosner's test.
- #' @note Anomalies are statistically significant at 95%.
- #' @note Underlying anomaly detection function is using piecewise median period of 2 weeks.
- #'
- #' @author Stefanie Molin \email{stefaniemolin@gmail.com}
- #'
- #' @examples
- #' \dontrun{
- #' results <- AnomalyCheck(username = "stefmolin", client_identifier = 12345, start = "2016-07-01", end = "2016-08-05",
- #' client.name = "My Client", uses.dedup = FALSE)
- #'
- #' # look into the client's sales events anomalies
- #' results$sales.evt.res
- #'
- #' # save sales events anomalies
- #' write.csv(results$sales.evt.res[["anoms"]], file = "My_Client_Sales_Event_Anomalies.csv")
- #'
- #' # save plot
- #' ggsave("My_Client_Sales_Events_Anomalies_Graph.png", plot = results$sales.evt.res[["plot"]])
- #' }
- #' @export
- AnomalyCheck <- function(username = "f.last", client_identifier, campaign.id = NULL, start = "YYYY-MM-DD", end = "YYYY-MM-DD",
- direction = "both", max.anomaly.percent = 0.1, client.name = NULL, campaign.name = NULL,
- uses.dedup = TRUE, password = NULL){
- require(AnomalyDetection)
- require(RJDBC)
- require(dplyr)
- require(packageToConnectToYourCompanyDB) # QueryYourDB() will come from this package
- # only ask for password once
- if(is.null(password)){
- password <- .rs.askForPassword("Database password:")
- }
- if(end == "YYYY-MM-DD"){
- end <- as.character(Sys.Date() - 1)
- }
- if(start == "YYYY-MM-DD"){
- stop("You must provide a start date.", call. = FALSE)
- }
- if(is.null(client_identifier)){
- stop("You must provide a client identifier.", call. = FALSE)
- }
- # client-level event check
- event_anomaly_query <- "select to_timestamp(event_ts) AS timestamp, COUNT(0) AS events
- from your_event_table
- where your_client_identifier = %s and day >= '%s' and day <= '%s' and persistent_user = TRUE
- GROUP BY event_ts
- ORDER BY event_ts"
- evt <- QueryYourDB(username = username, query = sprintf(event_anomaly_query, client_identifier, start, end),
- password = password)
- evt$timestamp <- as.POSIXlt(evt$timestamp, format = "%Y-%m-%d %H:%M:%S")
- evt$timestamp <- strftime(trunc(evt$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- evt <- as.data.frame(evt %>% group_by(timestamp) %>% summarise(Events = sum(events, na.rm = TRUE)))
- evt.res <- AnomalyDetectionTs(evt, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
- piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Event Count (Persistent User)", title = paste(client.name, "Events"))
- if(uses.dedup){
- # client-level dedup parameter check
- dedup_query1 <- "select to_timestamp(event_ts) AS timestamp, hour(to_timestamp(event_ts)) as hour,
- sum(case when dd ilike '%false%' then 1 else 0 end) as dd_false,
- sum(case when dd ilike '%true%' then 1 else 0 end) as dd_true, count(distinct transaction_id) as events
- from your_event_table
- where your_client_identifier ="
- dedup_query2 <- "and day >= '%s' and day <= '%s' and persistent_user = true and event_name_id = 6
- group by day(day), hour(to_timestamp(event_ts)), to_timestamp(event_ts)
- order by timestamp"
- dedup <- QueryYourDB(username = username, query = paste(dedup_query1, client_identifier, sprintf(dedup_query2, start, end)),
- password = password)
- dedup$timestamp <- as.POSIXlt(dedup$timestamp, format = "%Y-%m-%d %H:%M:%S")
- dedup$timestamp <- strftime(trunc(dedup$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- dedup <- as.data.frame(dedup %>% group_by(timestamp) %>% summarise(dedupFalse = sum(dd_false, na.rm = TRUE),
- dedupTrue = sum(dd_true, na.rm = TRUE)))
- dedup$otherRatio <- dedup$dedupFalse/(dedup$dedupFalse + dedup$dedupTrue)*100
- dedup$yourRatio <- dedup$dedupTrue/(dedup$dedupFalse + dedup$dedupTrue)*100
- dedup.res <- AnomalyDetectionTs(dedup[,c("timestamp", "yourRatio")], max_anoms = max.anomaly.percent,
- direction = direction, alpha = 0.05, piecewise_median_period_weeks = 2,
- plot = TRUE, xlabel = "Date", ylabel = "Dedup Ratio (You to Total) as %",
- title = paste(client.name, "Dedup Ratio"))
- } else{
- dedup.res = "Client doesn't use deduplication."
- }
- # client-level sales events
- sales_anomaly_query <- "select to_timestamp(event_ts) AS timestamp, COUNT(0) AS events from your_event_table
- where your_client_identifier = %s and day >= '%s' and day <= '%s' and event_name_id=6 and persistent_user = TRUE
- GROUP BY event_ts ORDER BY event_ts"
- sales.evt <- QueryYourDB(username = username, query = sprintf(sales_anomaly_query, client_identifier, start, end),
- password = password)
- sales.evt$timestamp <- as.POSIXlt(sales.evt$timestamp, format = "%Y-%m-%d %H:%M:%S")
- sales.evt$timestamp <- strftime(trunc(sales.evt$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- sales.evt <- as.data.frame(sales.evt %>% group_by(timestamp) %>% summarise(Events = sum(events, na.rm = TRUE)))
- sales.evt.res <- AnomalyDetectionTs(sales.evt, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
- piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Sales Event Count (Persistent User)",
- title = paste(client.name, "Sales Events"))
- # client-level clicks
- click_anomaly_query <- "select to_timestamp(click_ts) AS timestamp, COUNT(0) AS clicks from your_clicks_data
- where your_client_identifier = %s and day >= '%s' and day <= '%s' GROUP BY click_ts ORDER BY timestamp"
- click <- QueryYourDB(username = username, query = sprintf(click_anomaly_query, client_identifier, start, end),
- password = password)
- click$timestamp <- as.POSIXlt(click$timestamp, format = "%Y-%m-%d %H:%M:%S")
- click$timestamp <- strftime(trunc(click$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- click <- as.data.frame(click %>% group_by(timestamp) %>% summarise(Clicks = sum(clicks, na.rm = TRUE)))
- click.res <- AnomalyDetectionTs(click, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
- piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Click Count", title = paste(client.name, "Clicks"))
- # client-level conversions
- if(uses.dedup){
- conversion_query <- "select to_timestamp(trans_timestamp) AS timestamp, COUNT(DISTINCT transaction_id) AS conversions
- from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
- and deduplication_matching = 1 and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
- } else{
- conversion_query <- "select to_timestamp(trans_timestamp) AS timestamp, COUNT(DISTINCT transaction_id) AS conversions
- from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
- and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
- }
- conv <- QueryYourDB(username = username, query = sprintf(conversion_query, client_identifier, start, end),
- password = password)
- conv$timestamp <- as.POSIXlt(conv$timestamp, format = "%Y-%m-%d %H:%M:%S")
- conv$timestamp <- strftime(trunc(conv$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- conv <- as.data.frame(conv %>% group_by(timestamp) %>% summarise(Conversions = sum(conversions, na.rm = TRUE)))
- conv.res <- AnomalyDetectionTs(conv, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
- piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Conversion Count", title = paste(client.name, "Conversions"))
- # if campaign id is null return a list of all client-level results else run additional campaign level
- if(is.null(campaign.id)){
- resultsList <- list(events = evt.res, dedupRatio = dedup.res, salesEvents = sales.evt.res, clicks = click.res,
- conversions = conv.res)
- } else{
- # if campaign.id isn't NULL run additional analysis on the campaign indicated for clicks and conversions
- # campaign-level clicks
- click_camp_anomaly_query <- "select to_timestamp(click_ts) AS timestamp, COUNT(0) AS clicks from your_clicks_data
- where your_client_identifier = %s and day >= '%s' and day <= '%s' and campaign_id = %s GROUP BY click_ts ORDER BY timestamp"
- click.camp <- QueryYourDB(username = username, query = sprintf(click_camp_anomaly_query, client_identifier, start,
- end, campaign.id), password = password)
- click.camp$timestamp <- as.POSIXlt(click.camp$timestamp, format = "%Y-%m-%d %H:%M:%S")
- click.camp$timestamp <- strftime(trunc(click.camp$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- click.camp <- as.data.frame(click.camp %>% group_by(timestamp) %>% summarise(Clicks = sum(clicks, na.rm = TRUE)))
- click.camp.res <- AnomalyDetectionTs(click.camp, max_anoms = max.anomaly.percent, direction = direction,
- alpha = 0.05, piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Click Count", title = paste(client.name, campaign.name, "Clicks"))
- if(uses.dedup){
- conversion_camp_query <- "select to_timestamp(trans_timestamp) AS timestamp,
- COUNT(DISTINCT transaction_id) AS conversions
- from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
- and campaign_id = %s and deduplication_matching = 1 and attribution_type = 'pc'
- GROUP BY trans_timestamp ORDER BY timestamp"
- } else{
- conversion_camp_query <- "select to_timestamp(trans_timestamp) AS timestamp,
- COUNT(DISTINCT transaction_id) AS conversions
- from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
- and campaign_id = %s and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
- }
- conv.camp <- QueryYourDB(username = username, query = sprintf(conversion_query, client_identifier, start, end,
- campaign.id), password = password)
- conv.camp$timestamp <- as.POSIXlt(conv.camp$timestamp, format = "%Y-%m-%d %H:%M:%S")
- conv.camp$timestamp <- strftime(trunc(conv.camp$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
- conv.camp <- as.data.frame(conv.camp %>% group_by(timestamp) %>% summarise(Conversions = sum(conversions,
- na.rm = TRUE)))
- conv.camp.res <- AnomalyDetectionTs(conv.camp, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
- piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
- ylabel = "Conversion Count", title = paste(client.name, "Conversions"))
- resultsList <- list(events = evt.res, dedupRatio = dedup.res, salesEvents = sales.evt.res,
- yourClientClicks = click.res, yourClientConversions = conv.res,
- campaignClicks = click.camp.res, campaignConversions = conv.camp.res)
- }
- return(resultsList)
- }
Add Comment
Please, Sign In to add comment