daily pastebin goal
83%
SHARE
TWEET

dts2fst.R

lvalnegri May 16th, 2018 121 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ###############################################
  2. # SHINY Explorer - dt conversion to fst format
  3. ###############################################
  4.  
  5. #===== LOAD PACKAGES ------------------------------------------------------------------------------------------------------------
  6. library(fst)
  7. library(RMySQL)
  8. library(data.table)
  9.  
  10. #===== FUNCTIONS ----------------------------------------------------------------------------------------------------------------
  11. # recode multiple fields as ordered factors, using themselves or the lookup table as reference for the labels/levels
  12. recode.factors <-  function(dt, flds, lkps = FALSE){
  13.     for(fn in flds){
  14.         if(lkps){
  15.             lk <- sub('X', '', fn)
  16.             lvls <- lookups[domain_id == lk, lookup_id ]
  17.             lbls <- lookups[domain_id == lk, description ]
  18.         } else {
  19.             lvls <- sort(unique(dt[[fn]]))
  20.             lvls <- lvls[!(lvls %in% c(NA))]
  21.             lbls <- lvls
  22.         }
  23.         dt[, (fn) := factor(dt[[fn]], levels = lvls, labels = lbls, ordered = TRUE) ]
  24.     }
  25. }
  26.  
  27. #===== MAIN DATASET --------------------------------------------------------------------------------------------------------------
  28. # Load Data
  29. db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = audit)
  30. lookups <- suppressWarnings(data.table(dbGetQuery(db_conn, 'SELECT * FROM lookups WHERE domain_id != 9') ) )
  31. vars <- suppressWarnings(data.table(dbReadTable(db_conn, 'vars') ) )
  32. calendar <- suppressWarnings(data.table(dbReadTable(db_conn, 'calendar') ) )
  33. dt <- suppressWarnings(data.table(dbReadTable(db_conn, 'shinyexp_dataset'), key = 'HSP_id' ) )
  34. dbDisconnect(db_conn)
  35. db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = 'common')
  36. centres <- suppressWarnings(data.table(dbGetQuery(db_conn, paste('SELECT * FROM centres WHERE', audit)), key = 'HSP_id') )
  37. dbDisconnect(db_conn)
  38. # Add Hospitals info
  39. flds2update <- c('HSP_id', 'HSP', 'OA', 'CCG_id', 'CCG', 'LAT_id', 'LAT', 'NHSR_id', 'NHSR', 'CCR_id', 'CCR', 'CTRY_id', 'CTRY')
  40. dt <- dt[centres[, flds2update, with = FALSE] ]
  41. recode.factors(dt, flds2update)
  42. # Add Calendar info: date.xxx => date formats, datec.xxx => character/factor formats
  43. setkey(dt, 'datefield')
  44. dt <- calendar[, .(
  45.             datefield = DATEd,
  46.             date.day = as.Date(as.character(DATEd), '%Y%m%d'), datec.day = DATEd5, day_nid = dayOfMonth,
  47.             date.week = as.Date(DATEw5, '%d %b %y'), datec.week = DATEw5,
  48.             date.month = as.Date(paste0(DATEm, '01'), '%Y%m%d'), datec.month = DATEm1, daten.month = DATEm,
  49.             date.quarter = as.Date(
  50.                                 paste0(
  51.                                     substr(quartern, 1, 4),
  52.                                     ifelse(substr(quartern, 5, 5) == 4, '', '0'),
  53.                                     as.numeric(substr(quartern, 5, 5)) * 3,
  54.                                 '01'
  55.                             ), '%Y%m%d'
  56.                         ), datec.quarter = quarter,
  57.             date.year = year, datec.year = as.character(year)
  58. ), keyby = DATEd][dt][, DATEd := NULL]
  59. dt <- dt[!is.na(datefield)]
  60. dt[, month_id := as.numeric(format(date.day, "%m")) ]
  61.  
  62. flds2update <- c('datec.year', 'datec.quarter', 'datec.month', 'datec.week', 'datec.day')
  63. recode.factors(dt, flds2update)
  64.  
  65. dt[, day_lg := factor(day_id, levels = lookups[domain_id == 200091, lookup_id ], labels = lookups[domain_id == 200091, description ], ordered = TRUE )]
  66. dt[, day_st := factor(day_id, levels = lookups[domain_id == 200092, lookup_id ], labels = lookups[domain_id == 200092, description ], ordered = TRUE )]
  67. dt[, daypart := factor(daypart, levels = lookups[domain_id == 200095, lookup_id ], labels = lookups[domain_id == 200095, description ], ordered = TRUE )]
  68. dt[, month_lg := factor(month_id, levels = lookups[domain_id == 200093, lookup_id ], labels = lookups[domain_id == 200093, description ], ordered = TRUE )]
  69. dt[, month_st := factor(month_id, levels = lookups[domain_id == 200094, lookup_id ], labels = lookups[domain_id == 200094, description ], ordered = TRUE )]
  70. # Factors and labels LOGICAL (Yes-No) and CATEGORICALS
  71. flds2update <- vars[nature %in% c('CAT', 'LGC') & is_active == 1, lookup_id]
  72. recode.factors(dt, flds2update, TRUE)
  73.  
  74. #===== CONSULTANTS --------------------------------------------------------------------------------------------------------------
  75. if(has_consultants){
  76.     # Load audit specific
  77.     db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = audit)
  78.     pc <- suppressWarnings(data.table(dbGetQuery(db_conn, 'SELECT proc_id, consultant_id, responsibility FROM procedures_consultants WHERE is_valid'), key = 'proc_id') )
  79.     dbDisconnect(db_conn)
  80.     # Load common
  81.     db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = 'common')
  82.     cons <- suppressWarnings(data.table(dbReadTable(db_conn, 'consultants'), key = 'consultant_id' ) )
  83.     dbDisconnect(db_conn)
  84.     # Recode
  85.     cons[, gender := factor(gender, levels = c('M', 'W'), labels = c('Male', 'Female'))]
  86.     recode.factors(cons, 'qualification')
  87.     cons[, regStatus := factor(regStatus, levels = lookups[domain_id == 10010, lookup_id ], labels = lookups[domain_id == 10010, description ], ordered = TRUE )]
  88. }
  89.  
  90. #===== SAVE fts DATA ------------------------------------------------------------------------------------------------------------
  91. write.fst(dt, paste0(data.path, audit, '_dt.fst'), 100 )
  92.  
  93. #===== CLEAN AND EXIT -----------------------------------------------------------------------------------------------------------
  94. rm(list = ls())
  95. gc()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top