Guest User

Untitled

a guest
Mar 20th, 2018
306
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.48 KB | None | 0 0
  1. ####################################################
  2. #### Anomaly Finder ####
  3. #### by: Stefanie Molin ####
  4. #### ####
  5. ## Find anomalies in time series for accounts ##
  6. ## across several dimensions for diagnosis. ##
  7. ## Also check a specific campaign if provided. ##
  8. ####################################################
  9.  
  10. #' Check Account (and Campaign) for Anomalies
  11. #'
  12. #' Find anomalies in time series for accounts across several dimensions for diagnosis. Also check a specific campaign if provided.
  13. #' @family Analytics & Queries
  14. #' @import AnomalyDetection
  15. #' @param username Your database username as a character string.
  16. #' @param client_identifier The client identifier of the client you want to run analysis for as an integer.
  17. #' @param campaign.id The campaign ID of the campaign you want to run analysis for as an integer. If NULL, only runs account level
  18. #' analysis.
  19. #' @param start The date you want to start your analysis formated as "YYYY-MM-DD".
  20. #' @param end The date you want to end your analysis formated as "YYYY-MM-DD".
  21. #' @param direction Either "both" for positive and negative anomalies or "pos" ("neg") for positive (negative) anomalies only.
  22. #' @param max.anomaly.percent The maximum percent (written as a decimal) of your data that can be considered an anomaly (must be
  23. #' less than 0.49).
  24. #' @param client.name The name of the client as a string to be printed to the graphs.
  25. #' @param campaign.name The name of the campaign as a string to be printed to campaign level graphs (if applicable).
  26. #' @param uses.dedup Boolean indicating whether your client provides you with deduplicated sales.
  27. #' @param password Character string for password. If left as NULL, RStudio will prompt for it.
  28. #'
  29. #' @return A list with results of anomaly detection analysis on total events, sales events, dedup ratio, client-level conversions,
  30. #' client-level clicks, and if applicable, campaign-level conversions and campaign-level clicks. Each includes a list of points
  31. #' that are anomalies and a graph for easy viewing.
  32. #'
  33. #' @note You need to provide at least 25 days of data for this analysis which uses Rosner's test.
  34. #' @note Anomalies are statistically significant at 95%.
  35. #' @note Underlying anomaly detection function is using piecewise median period of 2 weeks.
  36. #'
  37. #' @author Stefanie Molin \email{stefaniemolin@gmail.com}
  38. #'
  39. #' @examples
  40. #' \dontrun{
  41. #' results <- AnomalyCheck(username = "stefmolin", client_identifier = 12345, start = "2016-07-01", end = "2016-08-05",
  42. #' client.name = "My Client", uses.dedup = FALSE)
  43. #'
  44. #' # look into the client's sales events anomalies
  45. #' results$sales.evt.res
  46. #'
  47. #' # save sales events anomalies
  48. #' write.csv(results$sales.evt.res[["anoms"]], file = "My_Client_Sales_Event_Anomalies.csv")
  49. #'
  50. #' # save plot
  51. #' ggsave("My_Client_Sales_Events_Anomalies_Graph.png", plot = results$sales.evt.res[["plot"]])
  52. #' }
  53. #' @export
  54.  
  55. AnomalyCheck <- function(username = "f.last", client_identifier, campaign.id = NULL, start = "YYYY-MM-DD", end = "YYYY-MM-DD",
  56. direction = "both", max.anomaly.percent = 0.1, client.name = NULL, campaign.name = NULL,
  57. uses.dedup = TRUE, password = NULL){
  58. require(AnomalyDetection)
  59. require(RJDBC)
  60. require(dplyr)
  61. require(packageToConnectToYourCompanyDB) # QueryYourDB() will come from this package
  62.  
  63. # only ask for password once
  64. if(is.null(password)){
  65. password <- .rs.askForPassword("Database password:")
  66. }
  67.  
  68. if(end == "YYYY-MM-DD"){
  69. end <- as.character(Sys.Date() - 1)
  70. }
  71.  
  72. if(start == "YYYY-MM-DD"){
  73. stop("You must provide a start date.", call. = FALSE)
  74. }
  75.  
  76. if(is.null(client_identifier)){
  77. stop("You must provide a client identifier.", call. = FALSE)
  78. }
  79.  
  80. # client-level event check
  81. event_anomaly_query <- "select to_timestamp(event_ts) AS timestamp, COUNT(0) AS events
  82. from your_event_table
  83. where your_client_identifier = %s and day >= '%s' and day <= '%s' and persistent_user = TRUE
  84. GROUP BY event_ts
  85. ORDER BY event_ts"
  86. evt <- QueryYourDB(username = username, query = sprintf(event_anomaly_query, client_identifier, start, end),
  87. password = password)
  88. evt$timestamp <- as.POSIXlt(evt$timestamp, format = "%Y-%m-%d %H:%M:%S")
  89. evt$timestamp <- strftime(trunc(evt$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  90. evt <- as.data.frame(evt %>% group_by(timestamp) %>% summarise(Events = sum(events, na.rm = TRUE)))
  91. evt.res <- AnomalyDetectionTs(evt, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
  92. piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  93. ylabel = "Event Count (Persistent User)", title = paste(client.name, "Events"))
  94.  
  95. if(uses.dedup){
  96. # client-level dedup parameter check
  97. dedup_query1 <- "select to_timestamp(event_ts) AS timestamp, hour(to_timestamp(event_ts)) as hour,
  98. sum(case when dd ilike '%false%' then 1 else 0 end) as dd_false,
  99. sum(case when dd ilike '%true%' then 1 else 0 end) as dd_true, count(distinct transaction_id) as events
  100. from your_event_table
  101. where your_client_identifier ="
  102. dedup_query2 <- "and day >= '%s' and day <= '%s' and persistent_user = true and event_name_id = 6
  103. group by day(day), hour(to_timestamp(event_ts)), to_timestamp(event_ts)
  104. order by timestamp"
  105. dedup <- QueryYourDB(username = username, query = paste(dedup_query1, client_identifier, sprintf(dedup_query2, start, end)),
  106. password = password)
  107. dedup$timestamp <- as.POSIXlt(dedup$timestamp, format = "%Y-%m-%d %H:%M:%S")
  108. dedup$timestamp <- strftime(trunc(dedup$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  109. dedup <- as.data.frame(dedup %>% group_by(timestamp) %>% summarise(dedupFalse = sum(dd_false, na.rm = TRUE),
  110. dedupTrue = sum(dd_true, na.rm = TRUE)))
  111. dedup$otherRatio <- dedup$dedupFalse/(dedup$dedupFalse + dedup$dedupTrue)*100
  112. dedup$yourRatio <- dedup$dedupTrue/(dedup$dedupFalse + dedup$dedupTrue)*100
  113.  
  114. dedup.res <- AnomalyDetectionTs(dedup[,c("timestamp", "yourRatio")], max_anoms = max.anomaly.percent,
  115. direction = direction, alpha = 0.05, piecewise_median_period_weeks = 2,
  116. plot = TRUE, xlabel = "Date", ylabel = "Dedup Ratio (You to Total) as %",
  117. title = paste(client.name, "Dedup Ratio"))
  118. } else{
  119. dedup.res = "Client doesn't use deduplication."
  120. }
  121.  
  122.  
  123. # client-level sales events
  124. sales_anomaly_query <- "select to_timestamp(event_ts) AS timestamp, COUNT(0) AS events from your_event_table
  125. where your_client_identifier = %s and day >= '%s' and day <= '%s' and event_name_id=6 and persistent_user = TRUE
  126. GROUP BY event_ts ORDER BY event_ts"
  127. sales.evt <- QueryYourDB(username = username, query = sprintf(sales_anomaly_query, client_identifier, start, end),
  128. password = password)
  129. sales.evt$timestamp <- as.POSIXlt(sales.evt$timestamp, format = "%Y-%m-%d %H:%M:%S")
  130. sales.evt$timestamp <- strftime(trunc(sales.evt$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  131. sales.evt <- as.data.frame(sales.evt %>% group_by(timestamp) %>% summarise(Events = sum(events, na.rm = TRUE)))
  132. sales.evt.res <- AnomalyDetectionTs(sales.evt, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
  133. piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  134. ylabel = "Sales Event Count (Persistent User)",
  135. title = paste(client.name, "Sales Events"))
  136.  
  137. # client-level clicks
  138. click_anomaly_query <- "select to_timestamp(click_ts) AS timestamp, COUNT(0) AS clicks from your_clicks_data
  139. where your_client_identifier = %s and day >= '%s' and day <= '%s' GROUP BY click_ts ORDER BY timestamp"
  140. click <- QueryYourDB(username = username, query = sprintf(click_anomaly_query, client_identifier, start, end),
  141. password = password)
  142. click$timestamp <- as.POSIXlt(click$timestamp, format = "%Y-%m-%d %H:%M:%S")
  143. click$timestamp <- strftime(trunc(click$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  144. click <- as.data.frame(click %>% group_by(timestamp) %>% summarise(Clicks = sum(clicks, na.rm = TRUE)))
  145. click.res <- AnomalyDetectionTs(click, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
  146. piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  147. ylabel = "Click Count", title = paste(client.name, "Clicks"))
  148.  
  149. # client-level conversions
  150. if(uses.dedup){
  151. conversion_query <- "select to_timestamp(trans_timestamp) AS timestamp, COUNT(DISTINCT transaction_id) AS conversions
  152. from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
  153. and deduplication_matching = 1 and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
  154. } else{
  155. conversion_query <- "select to_timestamp(trans_timestamp) AS timestamp, COUNT(DISTINCT transaction_id) AS conversions
  156. from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
  157. and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
  158. }
  159. conv <- QueryYourDB(username = username, query = sprintf(conversion_query, client_identifier, start, end),
  160. password = password)
  161. conv$timestamp <- as.POSIXlt(conv$timestamp, format = "%Y-%m-%d %H:%M:%S")
  162. conv$timestamp <- strftime(trunc(conv$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  163. conv <- as.data.frame(conv %>% group_by(timestamp) %>% summarise(Conversions = sum(conversions, na.rm = TRUE)))
  164. conv.res <- AnomalyDetectionTs(conv, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
  165. piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  166. ylabel = "Conversion Count", title = paste(client.name, "Conversions"))
  167.  
  168. # if campaign id is null return a list of all client-level results else run additional campaign level
  169. if(is.null(campaign.id)){
  170. resultsList <- list(events = evt.res, dedupRatio = dedup.res, salesEvents = sales.evt.res, clicks = click.res,
  171. conversions = conv.res)
  172. } else{
  173. # if campaign.id isn't NULL run additional analysis on the campaign indicated for clicks and conversions
  174.  
  175. # campaign-level clicks
  176. click_camp_anomaly_query <- "select to_timestamp(click_ts) AS timestamp, COUNT(0) AS clicks from your_clicks_data
  177. where your_client_identifier = %s and day >= '%s' and day <= '%s' and campaign_id = %s GROUP BY click_ts ORDER BY timestamp"
  178. click.camp <- QueryYourDB(username = username, query = sprintf(click_camp_anomaly_query, client_identifier, start,
  179. end, campaign.id), password = password)
  180. click.camp$timestamp <- as.POSIXlt(click.camp$timestamp, format = "%Y-%m-%d %H:%M:%S")
  181. click.camp$timestamp <- strftime(trunc(click.camp$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  182. click.camp <- as.data.frame(click.camp %>% group_by(timestamp) %>% summarise(Clicks = sum(clicks, na.rm = TRUE)))
  183. click.camp.res <- AnomalyDetectionTs(click.camp, max_anoms = max.anomaly.percent, direction = direction,
  184. alpha = 0.05, piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  185. ylabel = "Click Count", title = paste(client.name, campaign.name, "Clicks"))
  186.  
  187. if(uses.dedup){
  188. conversion_camp_query <- "select to_timestamp(trans_timestamp) AS timestamp,
  189. COUNT(DISTINCT transaction_id) AS conversions
  190. from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
  191. and campaign_id = %s and deduplication_matching = 1 and attribution_type = 'pc'
  192. GROUP BY trans_timestamp ORDER BY timestamp"
  193. } else{
  194. conversion_camp_query <- "select to_timestamp(trans_timestamp) AS timestamp,
  195. COUNT(DISTINCT transaction_id) AS conversions
  196. from your_conversions_table where your_client_identifier = %s and day >= '%s' and day <= '%s'
  197. and campaign_id = %s and attribution_type = 'pc' GROUP BY trans_timestamp ORDER BY timestamp"
  198. }
  199. conv.camp <- QueryYourDB(username = username, query = sprintf(conversion_query, client_identifier, start, end,
  200. campaign.id), password = password)
  201. conv.camp$timestamp <- as.POSIXlt(conv.camp$timestamp, format = "%Y-%m-%d %H:%M:%S")
  202. conv.camp$timestamp <- strftime(trunc(conv.camp$timestamp, "day"), format = "%Y-%m-%d %H:%M:%S")
  203. conv.camp <- as.data.frame(conv.camp %>% group_by(timestamp) %>% summarise(Conversions = sum(conversions,
  204. na.rm = TRUE)))
  205. conv.camp.res <- AnomalyDetectionTs(conv.camp, max_anoms = max.anomaly.percent, direction = direction, alpha = 0.05,
  206. piecewise_median_period_weeks = 2, plot = TRUE, xlabel = "Date",
  207. ylabel = "Conversion Count", title = paste(client.name, "Conversions"))
  208.  
  209. resultsList <- list(events = evt.res, dedupRatio = dedup.res, salesEvents = sales.evt.res,
  210. yourClientClicks = click.res, yourClientConversions = conv.res,
  211. campaignClicks = click.camp.res, campaignConversions = conv.camp.res)
  212. }
  213. return(resultsList)
  214. }
Add Comment
Please, Sign In to add comment