Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###############################################
- # SHINY Explorer - dt conversion to fst format
- ###############################################
- #===== LOAD PACKAGES ------------------------------------------------------------------------------------------------------------
- library(fst)
- library(RMySQL)
- library(data.table)
- #===== FUNCTIONS ----------------------------------------------------------------------------------------------------------------
- # recode multiple fields as ordered factors, using themselves or the lookup table as reference for the labels/levels
- recode.factors <- function(dt, flds, lkps = FALSE){
- for(fn in flds){
- if(lkps){
- lk <- sub('X', '', fn)
- lvls <- lookups[domain_id == lk, lookup_id ]
- lbls <- lookups[domain_id == lk, description ]
- } else {
- lvls <- sort(unique(dt[[fn]]))
- lvls <- lvls[!(lvls %in% c(NA))]
- lbls <- lvls
- }
- dt[, (fn) := factor(dt[[fn]], levels = lvls, labels = lbls, ordered = TRUE) ]
- }
- }
- #===== MAIN DATASET --------------------------------------------------------------------------------------------------------------
- # Load Data
- db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = audit)
- lookups <- suppressWarnings(data.table(dbGetQuery(db_conn, 'SELECT * FROM lookups WHERE domain_id != 9') ) )
- vars <- suppressWarnings(data.table(dbReadTable(db_conn, 'vars') ) )
- calendar <- suppressWarnings(data.table(dbReadTable(db_conn, 'calendar') ) )
- dt <- suppressWarnings(data.table(dbReadTable(db_conn, 'shinyexp_dataset'), key = 'HSP_id' ) )
- dbDisconnect(db_conn)
- db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = 'common')
- centres <- suppressWarnings(data.table(dbGetQuery(db_conn, paste('SELECT * FROM centres WHERE', audit)), key = 'HSP_id') )
- dbDisconnect(db_conn)
- # Add Hospitals info
- flds2update <- c('HSP_id', 'HSP', 'OA', 'CCG_id', 'CCG', 'LAT_id', 'LAT', 'NHSR_id', 'NHSR', 'CCR_id', 'CCR', 'CTRY_id', 'CTRY')
- dt <- dt[centres[, flds2update, with = FALSE] ]
- recode.factors(dt, flds2update)
- # Add Calendar info: date.xxx => date formats, datec.xxx => character/factor formats
- setkey(dt, 'datefield')
- dt <- calendar[, .(
- datefield = DATEd,
- date.day = as.Date(as.character(DATEd), '%Y%m%d'), datec.day = DATEd5, day_nid = dayOfMonth,
- date.week = as.Date(DATEw5, '%d %b %y'), datec.week = DATEw5,
- date.month = as.Date(paste0(DATEm, '01'), '%Y%m%d'), datec.month = DATEm1, daten.month = DATEm,
- date.quarter = as.Date(
- paste0(
- substr(quartern, 1, 4),
- ifelse(substr(quartern, 5, 5) == 4, '', '0'),
- as.numeric(substr(quartern, 5, 5)) * 3,
- '01'
- ), '%Y%m%d'
- ), datec.quarter = quarter,
- date.year = year, datec.year = as.character(year)
- ), keyby = DATEd][dt][, DATEd := NULL]
- dt <- dt[!is.na(datefield)]
- dt[, month_id := as.numeric(format(date.day, "%m")) ]
- flds2update <- c('datec.year', 'datec.quarter', 'datec.month', 'datec.week', 'datec.day')
- recode.factors(dt, flds2update)
- dt[, day_lg := factor(day_id, levels = lookups[domain_id == 200091, lookup_id ], labels = lookups[domain_id == 200091, description ], ordered = TRUE )]
- dt[, day_st := factor(day_id, levels = lookups[domain_id == 200092, lookup_id ], labels = lookups[domain_id == 200092, description ], ordered = TRUE )]
- dt[, daypart := factor(daypart, levels = lookups[domain_id == 200095, lookup_id ], labels = lookups[domain_id == 200095, description ], ordered = TRUE )]
- dt[, month_lg := factor(month_id, levels = lookups[domain_id == 200093, lookup_id ], labels = lookups[domain_id == 200093, description ], ordered = TRUE )]
- dt[, month_st := factor(month_id, levels = lookups[domain_id == 200094, lookup_id ], labels = lookups[domain_id == 200094, description ], ordered = TRUE )]
- # Factors and labels LOGICAL (Yes-No) and CATEGORICALS
- flds2update <- vars[nature %in% c('CAT', 'LGC') & is_active == 1, lookup_id]
- recode.factors(dt, flds2update, TRUE)
- #===== CONSULTANTS --------------------------------------------------------------------------------------------------------------
- if(has_consultants){
- # Load audit specific
- db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = audit)
- pc <- suppressWarnings(data.table(dbGetQuery(db_conn, 'SELECT proc_id, consultant_id, responsibility FROM procedures_consultants WHERE is_valid'), key = 'proc_id') )
- dbDisconnect(db_conn)
- # Load common
- db_conn <- dbConnect(MySQL(), group = 'shiny', dbname = 'common')
- cons <- suppressWarnings(data.table(dbReadTable(db_conn, 'consultants'), key = 'consultant_id' ) )
- dbDisconnect(db_conn)
- # Recode
- cons[, gender := factor(gender, levels = c('M', 'W'), labels = c('Male', 'Female'))]
- recode.factors(cons, 'qualification')
- cons[, regStatus := factor(regStatus, levels = lookups[domain_id == 10010, lookup_id ], labels = lookups[domain_id == 10010, description ], ordered = TRUE )]
- }
- #===== SAVE fts DATA ------------------------------------------------------------------------------------------------------------
- write.fst(dt, paste0(data.path, audit, '_dt.fst'), 100 )
- #===== CLEAN AND EXIT -----------------------------------------------------------------------------------------------------------
- rm(list = ls())
- gc()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement