Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- rm(list=ls()); gc() # Cleanup
- library(tidyverse)
- library(lubridate)
- library(caret)
- options(tibble.width = Inf) # Prints all columns in a tibble in the console
- #' Create connection to SQL db:
- cxn <- DBI::dbConnect(RPostgreSQL::PostgreSQL(),
- host = "...",
- port = 5432,
- dbname = "...",
- user = "...",
- password = "...") # DBI::dbListTables(cxn)
- hard_drive_stats <- tbl(cxn, "hard_drive_stats")
- #' Get Top-10 most common hard drives:
- top_10_drives <- hard_drive_stats %>%
- group_by(model) %>%
- summarise(n = n()) %>%
- arrange(desc(n)) %>%
- top_n(10) %>%
- collect()
- # A tibble: 10 x 2
- model n
- <chr> <dbl>
- 1 ST4000DM000 2822282
- 2 HGST HMS5C4040BLE640 1363173
- 3 ST12000NM0007 1296465
- 4 ST8000NM0055 1293557
- 5 ST8000DM002 888774
- 6 HGST HMS5C4040ALE640 505045
- 7 ST6000DX000 169017
- 8 Hitachi HDS5C4040ALE630 115984
- 9 ST10000NM0086 109738
- 10 HGST HUH728080ALE600 94024
- #' Get a count, by model, of failures within dataset:
- failed_drives <- hard_drive_stats %>%
- filter(failure == 1) %>%
- group_by(model) %>%
- summarise(n_failure = n()) %>%
- arrange(desc(n_failure)) %>%
- collect() # sum(failed_drives[["n_failure"]]) # Note: we only have 336 instances of failed drives in the entire dataset (8.9MM records)
- #' After, JOIN-ing the above together, we confirm that we're dealing with a class imbalance problem (e.g. there are *very* few failures across millions of drives).
- #' Thus, we need to make a decision between over- or under-sampling and do k-fold cross-validation.
- top_drives_with_failures <- inner_join(top_10_drives, failed_drives, by = "model") %>%
- mutate(pct_failure = n_failure/n) %>%
- arrange(desc(n)) # sum(top_drives_with_failures[["n_failure"]])/sum(top_drives_with_failures[["n"]]) # about 0.003% failure
- #' Note: two drive models in the top-10 were excluded from this because they didn't have any failures
- > top_drives_with_failures
- # A tibble: 8 x 4
- model n n_failure pct_failure
- <chr> <dbl> <dbl> <dbl>
- 1 ST4000DM000 2822282 178 0.0000631
- 2 HGST HMS5C4040BLE640 1363173 16 0.0000117
- 3 ST12000NM0007 1296465 32 0.0000247
- 4 ST8000NM0055 1293557 28 0.0000216
- 5 ST8000DM002 888774 21 0.0000236
- 6 HGST HMS5C4040ALE640 505045 8 0.0000158
- 7 ST6000DX000 169017 1 0.00000592
- 8 HGST HUH728080ALE600 94024 3 0.0000319
Add Comment
Please, Sign In to add comment