Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #THIS CODE WAS WRITTEN IN RSTUDIO#
- install.packages("RMySQL")
- library(RMySQL)
- mydb = dbConnect(MySQL(), user='abacus117', password='aJ4K@!lg3N2', dbname='smalldb', host='mysql4.gear.host')
- #dbDisconnect(mydb)
- dbListTables(mydb)
- A <- fetch(dbSendQuery(mydb, 'SELECT x.*
- FROM
- (SELECT b.store_name,
- CAST(b.store_id AS CHAR) store_id,
- count(DISTINCT CAST(a.order_id AS CHAR)) order_count
- FROM all_orders a
- JOIN user_stores b
- ON a.store_id = b.store_id
- JOIN user_info c
- ON b.user_id=c.user_id
- WHERE YEAR(a.processed_at) < 2014
- AND a.canceled_at IS NULL
- AND c.account_paused IS NULL
- GROUP BY b.store_name,
- b.store_id) x
- ORDER BY x.order_count DESC
- LIMIT 10;'))
- print(A)
- write.table(A, "C:/Users/Daniel/Documents/MailChimp - 1A - Output.csv", sep=",", row.names=F)
- B <- fetch(dbSendQuery(mydb,
- "SELECT b.store_name,
- CAST(b.store_id AS CHAR) store_id,
- c.country,
- SUM(SUBSTRING(a.order_total, 2) + 0) revenue
- FROM smalldb.all_orders a
- JOIN smalldb.user_stores b
- ON a.store_id = b.store_id
- JOIN smalldb.user_info c
- ON b.user_id=c.user_id
- WHERE c.country in ('US','CA')
- AND a.canceled_at IS NULL
- GROUP BY b.store_name,
- b.store_id,
- c.country
- ORDER BY b.store_name,
- b.store_id,
- c.country;"))
- print(B)
- US <- B[ which(B$country=='US'), ]
- nrow(US)
- print(US)
- CA <- B[ which(B$country=='CA'), ]
- nrow(CA)
- print(CA)
- outlier_funk <- function(dt, var) {
- var_name <- eval(substitute(var),eval(dt))
- na1 <- sum(is.na(var_name))
- m1 <- mean(var_name, na.rm = T)
- par(mfrow=c(2, 2), oma=c(0,0,3,0))
- boxplot(var_name, main="With outliers")
- hist(var_name, main="With outliers", xlab=NA, ylab=NA)
- outlier <- boxplot.stats(var_name)$out
- mo <- mean(outlier)
- var_name <- ifelse(var_name %in% outlier, NA, var_name)
- boxplot(var_name, main="Without outliers")
- hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
- title("Outlier Check", outer=TRUE)
- na2 <- sum(is.na(var_name))
- cat("Outliers identified:", na2 - na1, "n")
- cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "n")
- cat("Mean of the outliers:", round(mo, 2), "n")
- m2 <- mean(var_name, na.rm = T)
- cat("Mean without removing outliers:", round(m1, 2), "n")
- cat("Mean if we remove outliers:", round(m2, 2), "n")
- response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
- if(response == "y" | response == "yes"){
- dt[as.character(substitute(var))] <- invisible(var_name)
- assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
- cat("Outliers successfully removed", "n")
- return(invisible(dt))
- } else{
- cat("Nothing changed", "n")
- return(invisible(var_name))
- }
- }
- outlier_funk(US, revenue)
- outlier_funk(CA,revenue)
- capture.output(print(summary(US$revenue)), file="C:/Users/Daniel/Documents/MailChimp - 2US - Output.csv")
- hist(US$revenue, main="US Revenues")
- capture.output(print(summary(CA$revenue)), file="C:/Users/Daniel/Documents/MailChimp - 2CA - Output.csv")
- hist(CA$revenue, main="CA Revenues")
- Users <- fetch(dbSendQuery(mydb,
- "SELECT
- c.country,
- COUNT(DISTINCT CAST(a.order_id AS CHAR)) orders,
- COUNT(DISTINCT CAST(c.user_id AS CHAR)) users,
- SUM(SUBSTRING(a.order_total, 2) + 0) revenue
- FROM smalldb.all_orders a
- JOIN smalldb.user_stores b
- ON a.store_id = b.store_id
- JOIN smalldb.user_info c
- ON b.user_id=c.user_id
- WHERE c.country in ('US','CA')
- AND a.canceled_at IS NULL
- AND c.account_paused IS NULL
- GROUP BY
- c.country;"))
- print(Users)
- C <- fetch(dbSendQuery(mydb,
- "SELECT x.*,
- (x.orders/x.stores) order_ratio,
- (x.revenue/x.stores) rev_store_ratio,
- (x.revenue/x.orders) rev_order_ratio
- FROM (
- SELECT
- c.country,
- COUNT(DISTINCT CAST(b.store_id AS CHAR)) stores,
- COUNT(DISTINCT CAST(a.order_id AS CHAR)) orders,
- SUM(SUBSTRING(a.order_total, 2) + 0) revenue
- FROM smalldb.all_orders a
- JOIN smalldb.user_stores b
- ON a.store_id = b.store_id
- JOIN smalldb.user_info c
- ON b.user_id=c.user_id
- WHERE c.country in ('US','CA')
- AND a.canceled_at IS NULL
- AND c.account_paused IS NULL
- GROUP BY
- c.country) x;"))
- print(C)
- US2 <- C[ which(C$country=='US'), ]
- nrow(US2)
- print(US2)
- CA2 <- C[ which(C$country=='CA'), ]
- nrow(CA2)
- print(CA2)
- D <- fetch(dbSendQuery(mydb,
- "SELECT b.store_name,
- CAST(b.store_id AS CHAR) store_id,
- c.country,
- COUNT(DISTINCT CAST(c.user_id AS CHAR)) users,
- COUNT(DISTINCT CAST(a.order_id AS CHAR)) orders,
- SUM(SUBSTRING(a.order_total, 2) + 0) revenue
- FROM smalldb.all_orders a
- JOIN smalldb.user_stores b
- ON a.store_id = b.store_id
- JOIN smalldb.user_info c
- ON b.user_id=c.user_id
- WHERE c.country in ('US','CA')
- AND a.canceled_at IS NULL
- AND c.account_paused IS NULL
- GROUP BY b.store_name,
- b.store_id,
- c.country
- ORDER BY b.store_name,
- b.store_id,
- c.country;"))
- print(D)
- US3 <- D[ which(D$country=='US'), ]
- nrow(US3)
- print(US3)
- CA3 <- D[ which(D$country=='CA'), ]
- nrow(CA3)
- print(CA3)
- summary(US3$revenue/US3$orders)
- hist(US3$revenue/US3$orders, main="US Revenue per Order")
- summary(CA3$revenue/CA3$orders)
- hist(CA3$revenue/CA3$orders, main="CA Revenue per Order")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement