Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # This script:
- # assigns a score to a labs propensity to buying
- # builds a logistic regression model to determine P[purchase for a new request]
- ## Needs correction on prior estimation
- # (1) correction for lab if it enrolled recently: not needed since those requests will not be on vendor_order
- # (2)
- rm(list=ls())
- options(java.parameters = "-Xmx8000m")
- library(plyr)
- library(ggplot2)
- #library(reshape2)
- library(RMySQL)
- getwd()
- setwd('/Users/jwan/')
- setwd('Documents/myRscripts/Data')
- # aggregate requests, offers, purchases for every group on weekly basis
- queryRanking <- "SELECT
- o.groups_id,
- #yearweek(rr.date_added) as YW,
- yearweek(vo.created_at) AS YW,
- ##CASE WHEN t.type_template_id < 14 THEN t.type_template_id ELSE 14 END AS myType,
- ##rr.order_source,
- #vb.vendor_user_id,
- count(DISTINCT vo.tbl_reagent_requests_id) AS requests,
- #count(DISTINCT vq.tbl_reagent_requests_id) AS requests,
- #count(DISTINCT vo.tbl_reagent_requests_id) as r2,
- count(DISTINCT pof.tbl_reagent_requests_id) AS Offers,
- count(DISTINCT CASE WHEN poi.status != 'CANCELLED' THEN poi.tbl_reagent_requests_id ELSE NULL END) AS purchaseItems
- FROM
- #tbl_reagent_requests rr
- #JOIN orders o ON o.tbl_reagent_requests_id = rr.req_id
- #JOIN vendor_order vo ON vo.tbl_reagent_requests_id = rr.req_id
- vendor_order vo JOIN orders o ON vo.tbl_reagent_requests_id = o.tbl_reagent_requests_id
- JOIN groups g ON g.id = o.groups_id
- #JOIN vendor_queue vq ON vo.tbl_reagent_requests_id = vq.tbl_reagent_requests_id
- #JOIN groups_attribute ga ON ga.groups_id = g.id
- JOIN tbl_universities_master z ON z.uni_id = g.university_id
- ##LEFT JOIN tbl_reagent_requests rr ON o.tbl_reagent_requests_id = rr.req_id
- ##LEFT JOIN types t ON rr.type_id = t.id
- #LEFT JOIN vendor_bid vb ON vq.id = vb.vendor_queue_id
- LEFT JOIN vendor_bid vb ON vo.tbl_reagent_requests_id = vb.tbl_reagent_requests_id
- LEFT JOIN purchase_offer_item pof ON vb.vendor_queue_id = pof.vendor_queue_id
- LEFT JOIN purchase_order_item poi ON vb.vendor_queue_id = poi.vendor_queue_id
- #LEFT JOIN (select groups_id,
- # count(DISTINCT CASE WHEN status != 'CANCELLED' THEN tbl_reagent_requests_id ELSE NULL END) as recentPurchases
- # FROM purchase_order_item
- # group by groups_id) A on A.groups_id = g.id AND A.created_at < rr.date_added AND yearweek(A.created_at) > yearweek(rr.date_added)-12
- WHERE
- vo.created_at >= '2014-08-04' AND
- yearweek(vo.created_at) < (yearweek(NOW())) AND
- #rr.date_added >= '2014-08-04' AND
- #yearweek(rr.date_added) < yearweek(NOW()) AND
- # rr.order_source != 'UPLOAD' AND
- # req_order_from != 2 AND req_old_order = 0 AND
- # req_delete_status IS NULL AND
- # rr.catalog_num != '' AND rr.catalog_num IS NOT NULL AND
- # length(rr.catalog_num) >= 3 AND
- # upper(rr.catalog_num) NOT LIKE 'NA' AND
- # upper(rr.catalog_num) NOT LIKE 'N/A' AND
- # z.country = 'US' AND
- (upper(z.uni_name) NOT LIKE '%QUARTZY%' AND upper(z.uni_name) NOT LIKE '%TEST%') #AND
- GROUP BY o.groups_id, YW; #vb.vendor_user_id, myType, rr.order_source,
- "
- queryRankingFaster <- "#EXPLAIN
- SELECT
- o.groups_id,
- yearweek(vo.created_at) AS YW,
- count(DISTINCT vo.tbl_reagent_requests_id) AS requests,
- count(DISTINCT pof.tbl_reagent_requests_id) AS Offers,
- count(DISTINCT CASE WHEN poi.status != 'CANCELLED' THEN poi.tbl_reagent_requests_id ELSE NULL END) AS purchaseItems
- FROM
- vendor_order vo JOIN orders o ON vo.tbl_reagent_requests_id = o.tbl_reagent_requests_id
- JOIN groups g ON g.id = o.groups_id AND g.university_id != 6493
- JOIN groups_attribute ga ON g.id = ga.groups_id AND ga.quote_enroll = 'ON'
- #JOIN tbl_universities_master z ON z.uni_id = g.university_id
- LEFT JOIN vendor_bid vb ON vo.tbl_reagent_requests_id = vb.tbl_reagent_requests_id
- LEFT JOIN purchase_offer_item pof ON vb.vendor_queue_id = pof.vendor_queue_id
- LEFT JOIN purchase_order_item poi ON poi.tbl_reagent_requests_id = vb.tbl_reagent_requests_id #vb.vendor_queue_id = poi.vendor_queue_id
- #WHERE
- #vo.created_at >= '2015-01-01' #AND
- GROUP BY o.groups_id, YW;"
- con <- dbConnect(MySQL(),user="quartzy_read", password="AqjjwE2D",dbname="quartzy", host="127.0.0.1", port=3307)
- on.exit(dbDisconnect(con))
- df1 <- dbGetQuery(con, queryRankingFaster)
- #gpwk_temp <- dbGetQuery(con, queryPurchase)
- dbDisconnect(con)
- head(df1)
- summary(df1)
- df1 <- df1[complete.cases(df1),] # there should not be any NA. DB sometimes has created_at 0000-000-0000..
- df1 <- subset(df1,YW < max(df1$YW)) # faster to do this here than in the query since YEARWEEK() does not have index
- df1$nYW <- substr(df1$YW,5,6)
- df1$nYW <- as.numeric(df1$nYW)
- df1$nYW[which(substr(df1$YW,3,4) == '15')] <- df1$nYW[which(substr(df1$YW,3,4) == '15')] + max(df1$nYW)
- dfPrior <- ddply(df1, .(groups_id), summarize, requests = sum(requests), Offers = sum(Offers), purchaseItems = sum(purchaseItems))
- dfPrior$gpCVR <- dfPrior$purchaseItems/dfPrior$requests ## Used all data, not just three months data
- hist(dfPrior$gpCVR, breaks = c(seq(-0.01,0.3,by=0.02),0.6,1.01),freq = TRUE )
- #### LME4 ####
- groups_id <- dfPrior[,'groups_id']
- dfMixedModel <- dfPrior[,c('purchaseItems','requests')]
- dfMixedModel <- as.matrix(dfMixedModel)
- library(lme4)
- m <- glmer(dfMixedModel ~ 1 + (1|groups_id), family=binomial, nAGQ=3)
- m
- a <- coef(m)$groups_id
- a$groups_id <- rownames(a)
- colnames(a) <- c('coeff','groups_id')
- str(a)
- a$groups_id <- as.numeric(a$groups_id)
- posteriorMixedModel <- merge(a,dfPrior[,c('groups_id','purchaseItems','requests')],by='groups_id')
- posteriorMixedModel$estimate <- (1 + exp(-posteriorMixedModel$coeff))^-1
- head(posteriorMixedModel)
- sum(posteriorMixedModel$purchaseItems)/sum(posteriorMixedModel$requests)
- tmp <- cbind((1 + exp(-a$coeff))^-1, dfPrior$gpCVR, dfPrior$purchaseItems,dfPrior$requests,groups_id)
- head(tmp,50)
- (1 + exp(-fixef(m)))^-1
- str(subset(posteriorMixedModel,estimate < (1 + exp(-fixef(m)))^-1))
- str(subset(posteriorMixedModel,estimate > (1 + exp(-fixef(m)))^-1))
- quantile(posteriorMixedModel$purchaseItems/posteriorMixedModel$requests,seq(0.01,1,0.01))
- posteriorMixedModel <- rbind(posteriorMixedModel, c(0,fixef(m),0,0,(1 + exp(-fixef(m)))^-1))
- result <- posteriorMixedModel[,c('groups_id','estimate')]
- colnames(result) <- c('groups_id','scores')
- write.csv(result,'result.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement