Guest User

Untitled

a guest
Jan 16th, 2019
140
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.62 KB | None | 0 0
  1. rm(list=ls()); gc() # Cleanup
  2.  
  3. library(tidyverse)
  4. library(lubridate)
  5. library(caret)
  6.  
  7. options(tibble.width = Inf) # Prints all columns in a tibble in the console
  8.  
  9. #' Create connection to SQL db:
  10. cxn <- DBI::dbConnect(RPostgreSQL::PostgreSQL(),
  11. host = "...",
  12. port = 5432,
  13. dbname = "...",
  14. user = "...",
  15. password = "...") # DBI::dbListTables(cxn)
  16.  
  17. hard_drive_stats <- tbl(cxn, "hard_drive_stats")
  18.  
  19. #' Get Top-10 most common hard drives:
  20. top_10_drives <- hard_drive_stats %>%
  21. group_by(model) %>%
  22. summarise(n = n()) %>%
  23. arrange(desc(n)) %>%
  24. top_n(10) %>%
  25. collect()
  26.  
  27. # A tibble: 10 x 2
  28. model n
  29. <chr> <dbl>
  30. 1 ST4000DM000 2822282
  31. 2 HGST HMS5C4040BLE640 1363173
  32. 3 ST12000NM0007 1296465
  33. 4 ST8000NM0055 1293557
  34. 5 ST8000DM002 888774
  35. 6 HGST HMS5C4040ALE640 505045
  36. 7 ST6000DX000 169017
  37. 8 Hitachi HDS5C4040ALE630 115984
  38. 9 ST10000NM0086 109738
  39. 10 HGST HUH728080ALE600 94024
  40.  
  41. #' Get a count, by model, of failures within dataset:
  42. failed_drives <- hard_drive_stats %>%
  43. filter(failure == 1) %>%
  44. group_by(model) %>%
  45. summarise(n_failure = n()) %>%
  46. arrange(desc(n_failure)) %>%
  47. collect() # sum(failed_drives[["n_failure"]]) # Note: we only have 336 instances of failed drives in the entire dataset (8.9MM records)
  48.  
  49. #' After, JOIN-ing the above together, we confirm that we're dealing with a class imbalance problem (e.g. there are *very* few failures across millions of drives).
  50. #' Thus, we need to make a decision between over- or under-sampling and do k-fold cross-validation.
  51. top_drives_with_failures <- inner_join(top_10_drives, failed_drives, by = "model") %>%
  52. mutate(pct_failure = n_failure/n) %>%
  53. arrange(desc(n)) # sum(top_drives_with_failures[["n_failure"]])/sum(top_drives_with_failures[["n"]]) # about 0.003% failure
  54. #' Note: two drive models in the top-10 were excluded from this because they didn't have any failures
  55.  
  56. > top_drives_with_failures
  57. # A tibble: 8 x 4
  58. model n n_failure pct_failure
  59. <chr> <dbl> <dbl> <dbl>
  60. 1 ST4000DM000 2822282 178 0.0000631
  61. 2 HGST HMS5C4040BLE640 1363173 16 0.0000117
  62. 3 ST12000NM0007 1296465 32 0.0000247
  63. 4 ST8000NM0055 1293557 28 0.0000216
  64. 5 ST8000DM002 888774 21 0.0000236
  65. 6 HGST HMS5C4040ALE640 505045 8 0.0000158
  66. 7 ST6000DX000 169017 1 0.00000592
  67. 8 HGST HUH728080ALE600 94024 3 0.0000319
Add Comment
Please, Sign In to add comment