Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2016
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.02 KB | None | 0 0
  1. # This script:
  2. # assigns a score to a labs propensity to buying
  3. # builds a logistic regression model to determine P[purchase for a new request]
  4.  
  5. ## Needs correction on prior estimation
  6. # (1) correction for lab if it enrolled recently: not needed since those requests will not be on vendor_order
  7. # (2)
  8. rm(list=ls())
  9. options(java.parameters = "-Xmx8000m")
  10.  
  11.  
  12. library(plyr)
  13. library(ggplot2)
  14. #library(reshape2)
  15. library(RMySQL)
  16. getwd()
  17. setwd('/Users/jwan/')
  18. setwd('Documents/myRscripts/Data')
  19.  
  20. # aggregate requests, offers, purchases for every group on weekly basis
  21. queryRanking <- "SELECT
  22. o.groups_id,
  23. #yearweek(rr.date_added) as YW,
  24. yearweek(vo.created_at) AS YW,
  25. ##CASE WHEN t.type_template_id < 14 THEN t.type_template_id ELSE 14 END AS myType,
  26. ##rr.order_source,
  27. #vb.vendor_user_id,
  28. count(DISTINCT vo.tbl_reagent_requests_id) AS requests,
  29. #count(DISTINCT vq.tbl_reagent_requests_id) AS requests,
  30. #count(DISTINCT vo.tbl_reagent_requests_id) as r2,
  31. count(DISTINCT pof.tbl_reagent_requests_id) AS Offers,
  32. count(DISTINCT CASE WHEN poi.status != 'CANCELLED' THEN poi.tbl_reagent_requests_id ELSE NULL END) AS purchaseItems
  33. FROM
  34. #tbl_reagent_requests rr
  35. #JOIN orders o ON o.tbl_reagent_requests_id = rr.req_id
  36. #JOIN vendor_order vo ON vo.tbl_reagent_requests_id = rr.req_id
  37. vendor_order vo JOIN orders o ON vo.tbl_reagent_requests_id = o.tbl_reagent_requests_id
  38. JOIN groups g ON g.id = o.groups_id
  39. #JOIN vendor_queue vq ON vo.tbl_reagent_requests_id = vq.tbl_reagent_requests_id
  40. #JOIN groups_attribute ga ON ga.groups_id = g.id
  41. JOIN tbl_universities_master z ON z.uni_id = g.university_id
  42. ##LEFT JOIN tbl_reagent_requests rr ON o.tbl_reagent_requests_id = rr.req_id
  43. ##LEFT JOIN types t ON rr.type_id = t.id
  44. #LEFT JOIN vendor_bid vb ON vq.id = vb.vendor_queue_id
  45. LEFT JOIN vendor_bid vb ON vo.tbl_reagent_requests_id = vb.tbl_reagent_requests_id
  46. LEFT JOIN purchase_offer_item pof ON vb.vendor_queue_id = pof.vendor_queue_id
  47. LEFT JOIN purchase_order_item poi ON vb.vendor_queue_id = poi.vendor_queue_id
  48. #LEFT JOIN (select groups_id,
  49. # count(DISTINCT CASE WHEN status != 'CANCELLED' THEN tbl_reagent_requests_id ELSE NULL END) as recentPurchases
  50. # FROM purchase_order_item
  51. # group by groups_id) A on A.groups_id = g.id AND A.created_at < rr.date_added AND yearweek(A.created_at) > yearweek(rr.date_added)-12
  52. WHERE
  53. vo.created_at >= '2014-08-04' AND
  54. yearweek(vo.created_at) < (yearweek(NOW())) AND
  55. #rr.date_added >= '2014-08-04' AND
  56. #yearweek(rr.date_added) < yearweek(NOW()) AND
  57. # rr.order_source != 'UPLOAD' AND
  58. # req_order_from != 2 AND req_old_order = 0 AND
  59. # req_delete_status IS NULL AND
  60. # rr.catalog_num != '' AND rr.catalog_num IS NOT NULL AND
  61. # length(rr.catalog_num) >= 3 AND
  62. # upper(rr.catalog_num) NOT LIKE 'NA' AND
  63. # upper(rr.catalog_num) NOT LIKE 'N/A' AND
  64. # z.country = 'US' AND
  65. (upper(z.uni_name) NOT LIKE '%QUARTZY%' AND upper(z.uni_name) NOT LIKE '%TEST%') #AND
  66. GROUP BY o.groups_id, YW; #vb.vendor_user_id, myType, rr.order_source,
  67. "
  68.  
  69. queryRankingFaster <- "#EXPLAIN
  70. SELECT
  71. o.groups_id,
  72. yearweek(vo.created_at) AS YW,
  73. count(DISTINCT vo.tbl_reagent_requests_id) AS requests,
  74. count(DISTINCT pof.tbl_reagent_requests_id) AS Offers,
  75. count(DISTINCT CASE WHEN poi.status != 'CANCELLED' THEN poi.tbl_reagent_requests_id ELSE NULL END) AS purchaseItems
  76. FROM
  77. vendor_order vo JOIN orders o ON vo.tbl_reagent_requests_id = o.tbl_reagent_requests_id
  78. JOIN groups g ON g.id = o.groups_id AND g.university_id != 6493
  79. JOIN groups_attribute ga ON g.id = ga.groups_id AND ga.quote_enroll = 'ON'
  80. #JOIN tbl_universities_master z ON z.uni_id = g.university_id
  81. LEFT JOIN vendor_bid vb ON vo.tbl_reagent_requests_id = vb.tbl_reagent_requests_id
  82. LEFT JOIN purchase_offer_item pof ON vb.vendor_queue_id = pof.vendor_queue_id
  83. LEFT JOIN purchase_order_item poi ON poi.tbl_reagent_requests_id = vb.tbl_reagent_requests_id #vb.vendor_queue_id = poi.vendor_queue_id
  84. #WHERE
  85. #vo.created_at >= '2015-01-01' #AND
  86. GROUP BY o.groups_id, YW;"
  87.  
  88. con <- dbConnect(MySQL(),user="quartzy_read", password="AqjjwE2D",dbname="quartzy", host="127.0.0.1", port=3307)
  89. on.exit(dbDisconnect(con))
  90.  
  91. df1 <- dbGetQuery(con, queryRankingFaster)
  92. #gpwk_temp <- dbGetQuery(con, queryPurchase)
  93. dbDisconnect(con)
  94. head(df1)
  95. summary(df1)
  96. df1 <- df1[complete.cases(df1),] # there should not be any NA. DB sometimes has created_at 0000-000-0000..
  97. df1 <- subset(df1,YW < max(df1$YW)) # faster to do this here than in the query since YEARWEEK() does not have index
  98.  
  99. df1$nYW <- substr(df1$YW,5,6)
  100. df1$nYW <- as.numeric(df1$nYW)
  101. df1$nYW[which(substr(df1$YW,3,4) == '15')] <- df1$nYW[which(substr(df1$YW,3,4) == '15')] + max(df1$nYW)
  102.  
  103. dfPrior <- ddply(df1, .(groups_id), summarize, requests = sum(requests), Offers = sum(Offers), purchaseItems = sum(purchaseItems))
  104. dfPrior$gpCVR <- dfPrior$purchaseItems/dfPrior$requests ## Used all data, not just three months data
  105. hist(dfPrior$gpCVR, breaks = c(seq(-0.01,0.3,by=0.02),0.6,1.01),freq = TRUE )
  106.  
  107. #### LME4 ####
  108. groups_id <- dfPrior[,'groups_id']
  109. dfMixedModel <- dfPrior[,c('purchaseItems','requests')]
  110. dfMixedModel <- as.matrix(dfMixedModel)
  111. library(lme4)
  112. m <- glmer(dfMixedModel ~ 1 + (1|groups_id), family=binomial, nAGQ=3)
  113. m
  114. a <- coef(m)$groups_id
  115. a$groups_id <- rownames(a)
  116. colnames(a) <- c('coeff','groups_id')
  117. str(a)
  118. a$groups_id <- as.numeric(a$groups_id)
  119. posteriorMixedModel <- merge(a,dfPrior[,c('groups_id','purchaseItems','requests')],by='groups_id')
  120. posteriorMixedModel$estimate <- (1 + exp(-posteriorMixedModel$coeff))^-1
  121. head(posteriorMixedModel)
  122. sum(posteriorMixedModel$purchaseItems)/sum(posteriorMixedModel$requests)
  123.  
  124. tmp <- cbind((1 + exp(-a$coeff))^-1, dfPrior$gpCVR, dfPrior$purchaseItems,dfPrior$requests,groups_id)
  125. head(tmp,50)
  126. (1 + exp(-fixef(m)))^-1
  127. str(subset(posteriorMixedModel,estimate < (1 + exp(-fixef(m)))^-1))
  128. str(subset(posteriorMixedModel,estimate > (1 + exp(-fixef(m)))^-1))
  129. quantile(posteriorMixedModel$purchaseItems/posteriorMixedModel$requests,seq(0.01,1,0.01))
  130. posteriorMixedModel <- rbind(posteriorMixedModel, c(0,fixef(m),0,0,(1 + exp(-fixef(m)))^-1))
  131.  
  132. result <- posteriorMixedModel[,c('groups_id','estimate')]
  133. colnames(result) <- c('groups_id','scores')
  134. write.csv(result,'result.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement