Untitled

rm(list=ls()); gc()  # Cleanup

library(tidyverse)
library(lubridate)
library(caret)

options(tibble.width = Inf)  # Prints all columns in a tibble in the console

#' Create connection to SQL db:
cxn <- DBI::dbConnect(RPostgreSQL::PostgreSQL(),
                      host = "...",
                      port = 5432,
                      dbname = "...",
                      user = "...",
                      password = "...")  # DBI::dbListTables(cxn)

hard_drive_stats <- tbl(cxn, "hard_drive_stats")

#' Get Top-10 most common hard drives:
top_10_drives <- hard_drive_stats %>%
  group_by(model) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  top_n(10) %>%
  collect()

# A tibble: 10 x 2
   model                         n
   <chr>                     <dbl>
 1 ST4000DM000             2822282
 2 HGST HMS5C4040BLE640    1363173
 3 ST12000NM0007           1296465
 4 ST8000NM0055            1293557
 5 ST8000DM002              888774
 6 HGST HMS5C4040ALE640     505045
 7 ST6000DX000              169017
 8 Hitachi HDS5C4040ALE630  115984
 9 ST10000NM0086            109738
10 HGST HUH728080ALE600      94024

#' Get a count, by model, of failures within dataset:
failed_drives <- hard_drive_stats %>%
  filter(failure == 1) %>%
  group_by(model) %>%
  summarise(n_failure = n()) %>%
  arrange(desc(n_failure)) %>%
  collect()  # sum(failed_drives[["n_failure"]])  # Note: we only have 336 instances of failed drives in the entire dataset (8.9MM records)

#' After, JOIN-ing the above together, we confirm that we're dealing with a class imbalance problem (e.g. there are *very* few failures across millions of drives).
#' Thus, we need to make a decision between over- or under-sampling and do k-fold cross-validation.
top_drives_with_failures <- inner_join(top_10_drives, failed_drives, by = "model") %>%
  mutate(pct_failure = n_failure/n) %>%
  arrange(desc(n))  # sum(top_drives_with_failures[["n_failure"]])/sum(top_drives_with_failures[["n"]])  # about 0.003% failure
#' Note: two drive models in the top-10 were excluded from this because they didn't have any failures

> top_drives_with_failures
# A tibble: 8 x 4
  model                      n n_failure pct_failure
  <chr>                  <dbl>     <dbl>       <dbl>
1 ST4000DM000          2822282       178  0.0000631
2 HGST HMS5C4040BLE640 1363173        16  0.0000117
3 ST12000NM0007        1296465        32  0.0000247
4 ST8000NM0055         1293557        28  0.0000216
5 ST8000DM002           888774        21  0.0000236
6 HGST HMS5C4040ALE640  505045         8  0.0000158
7 ST6000DX000           169017         1  0.00000592
8 HGST HUH728080ALE600   94024         3  0.0000319