Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
-- Desc: Analysis of Backblaze SMART drive data using DuckDB
-- URL:  https://www.backblaze.com/cloud-storage/resources/hard-drive-test-data
-- Auth: Brian Dill
-- Date: 2025-03-23

-- Create/attach the database file (D:\ is a 2.5" SSD).
-- NOTE: string literals use single quotes; double quotes are identifier quoting in DuckDB.
ATTACH 'D:/DuckDB/backblaze.duckdb';
-- ================================================================================
-- drive_days: one row per (drive, day) snapshot, loaded from the Backblaze daily CSVs.
-- DROP TABLE drive_days;
CREATE TABLE IF NOT EXISTS drive_days (
      "date"          DATE      -- snapshot day ("date" quoted: DATE is a reserved word)
    , serial_number   VARCHAR   -- unique per physical drive
    , model           VARCHAR   -- manufacturer model string
    , capacity_bytes  BIGINT    -- as reported by the drive (a few report 0 or negative)
    , failure         BIGINT    -- 1 on the day the drive failed, else 0
    -- SMART columns deliberately omitted to keep the table small; re-enable if needed:
    -- , smart_5_normalized   VARCHAR
    -- , smart_5_raw          BIGINT
    -- , smart_187_normalized VARCHAR
    -- , smart_187_raw        BIGINT
    -- , smart_188_normalized VARCHAR
    -- , smart_188_raw        BIGINT
    -- , smart_197_normalized VARCHAR
    -- , smart_197_raw        BIGINT
    -- , smart_198_normalized VARCHAR
    -- , smart_198_raw        BIGINT
);
-- ================================================================================
-- Load CSV files
-- "ORDER BY model, serial_number, date" maximizes columnar compression by sorting
-- on the lowest-cardinality column first.
-- K:\ is a WD RED 4TB spindle HDD holding all of the unzipped *.csv files.
SET VARIABLE path = 'K:/R/Backblaze/';   -- single quotes: this is a string literal
/*
-- One-time bulk load, kept commented out for reference (timing noted above each batch).
-- NOTE: getvariable() takes the variable NAME as a single-quoted string literal:
--       getvariable('path'). An unquoted PATH would parse as a column reference and fail.
--CREATE OR REPLACE TABLE drive_days AS SELECT * FROM read_csv_auto(getvariable('path') || '2014/*.csv', header = true, filename = false);
-- 2.8 GB in 1:08 (the csv files total 2.8 GB and the import took 1 min and 8 seconds)
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2014/*.csv', header = true) ORDER BY model, serial_number, date;
-- 4.2 GB in 1:22
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2015/*.csv', header = true) ORDER BY model, serial_number, date;
-- 6.0 GB in 1:55
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2016/*.csv', header = true) ORDER BY model, serial_number, date;
-- 7.6 GB in 2:34
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2017/*.csv', header = true) ORDER BY model, serial_number, date;
-- 10.0 GB in 3:10
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2018/*.csv', header = true) ORDER BY model, serial_number, date;
-- 11.8 GB in 3:28
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2019/*.csv', header = true) ORDER BY model, serial_number, date;
-- 15.5 GB in 5:30
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2020/*.csv', header = true) ORDER BY model, serial_number, date;
-- 21.5 GB in 13:36
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2021/*.csv', header = true) ORDER BY model, serial_number, date;
-- 26.9 GB in 25:35
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2022/*.csv', header = true) ORDER BY model, serial_number, date;
-- The import started taking too long (exceeding my 32GB RAM), so the CSV files were
-- sub-divided into quarters rather than whole years from 2023 onward.
-- 7.1 GB in 1:21
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2023/2023Q1/*.csv', header = true) ORDER BY model, serial_number, date;
-- 7.6 GB in 1:26
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2023/2023Q2/*.csv', header = true) ORDER BY model, serial_number, date;
-- 8.7 GB in 1:40
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2023/2023Q3/*.csv', header = true) ORDER BY model, serial_number, date;
-- 9.1 GB in 1:49
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2023/2023Q4/*.csv', header = true) ORDER BY model, serial_number, date;
-- 9.3 GB in 1:49
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2024/2024Q1/*.csv', header = true) ORDER BY model, serial_number, date;
-- 9.6 GB in 1:53
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2024/2024Q2/*.csv', header = true) ORDER BY model, serial_number, date;
-- 9.9 GB in 1:59
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2024/2024Q3/*.csv', header = true) ORDER BY model, serial_number, date;
-- 10.2 GB in 2:04
INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(getvariable('path') || '2024/2024Q4/*.csv', header = true) ORDER BY model, serial_number, date;
*/
-- ================================================================================
-- Sanity checks on the loaded data.
SELECT * FROM main.drive_days LIMIT 10;
DESCRIBE drive_days;
SUMMARIZE drive_days; -- 0:34
-- Total count - almost 600 million rows
SELECT COUNT(*) AS n FROM main.drive_days; -- 559,474,362
-- Count by year
SELECT YEAR("date") AS "year", COUNT(*) AS n FROM main.drive_days GROUP BY ALL; -- < 1 sec
-- Count by day
SELECT "date", COUNT(*) AS n FROM main.drive_days GROUP BY "date"; -- < 1 sec
SELECT "date", model, COUNT(*) AS n FROM main.drive_days GROUP BY "date", model; -- < 1 sec
-- NOTE: requires main.serials, which is created further down in this script.
SELECT dd."date", s.make, COUNT(*) AS n FROM main.drive_days AS dd JOIN main.serials AS s ON s.model = dd.model GROUP BY dd."date", s.make; -- < 1 sec
-- What is the list of models?
SELECT model, COUNT(*) AS model_drive_days FROM main.drive_days GROUP BY ALL ORDER BY model_drive_days DESC; -- 171 models
-- How many rows were recorded per model per day (only well-populated combinations)?
SELECT model, "date", COUNT(*) AS n FROM main.drive_days GROUP BY ALL HAVING COUNT(*) > 100 ORDER BY model, "date";
-- ================================================================================
-- Export each year's data to a parquet file.
-- Half-open date ranges ( >= Jan 1, < next Jan 1 ) avoid boundary double-counting;
-- file paths are single-quoted string literals.
COPY (SELECT * FROM drive_days WHERE "date" >= '2014-01-01' AND "date" < '2015-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2014.parquet' (FORMAT parquet); -- 0:06
COPY (SELECT * FROM drive_days WHERE "date" >= '2015-01-01' AND "date" < '2016-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2015.parquet' (FORMAT parquet); -- 0:08
COPY (SELECT * FROM drive_days WHERE "date" >= '2016-01-01' AND "date" < '2017-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2016.parquet' (FORMAT parquet); -- 0:12
COPY (SELECT * FROM drive_days WHERE "date" >= '2017-01-01' AND "date" < '2018-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2017.parquet' (FORMAT parquet); -- 0:16
COPY (SELECT * FROM drive_days WHERE "date" >= '2018-01-01' AND "date" < '2019-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2018.parquet' (FORMAT parquet); -- 0:22
COPY (SELECT * FROM drive_days WHERE "date" >= '2019-01-01' AND "date" < '2020-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2019.parquet' (FORMAT parquet); -- 0:29
COPY (SELECT * FROM drive_days WHERE "date" >= '2020-01-01' AND "date" < '2021-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2020.parquet' (FORMAT parquet); -- 0:40
COPY (SELECT * FROM drive_days WHERE "date" >= '2021-01-01' AND "date" < '2022-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2021.parquet' (FORMAT parquet); -- 1:15
COPY (SELECT * FROM drive_days WHERE "date" >= '2022-01-01' AND "date" < '2023-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2022.parquet' (FORMAT parquet); -- 1:23
COPY (SELECT * FROM drive_days WHERE "date" >= '2023-01-01' AND "date" < '2024-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2023.parquet' (FORMAT parquet); -- 1:46
COPY (SELECT * FROM drive_days WHERE "date" >= '2024-01-01' AND "date" < '2025-01-01' ORDER BY model, serial_number, "date") TO 'D:/DuckDB/drive_days_2024.parquet' (FORMAT parquet); -- 2:15
-- ================================================================================
-- serials: one row per serial number, summarizing that drive's whole service life.
-- DROP TABLE main.serials;
CREATE TABLE IF NOT EXISTS main.serials (
      make            VARCHAR        -- manufacturer, derived from the model prefix
    , model           VARCHAR
    , serial_number   VARCHAR
    , capacity_bytes  BIGINT
    , capacity_gb     DECIMAL(12,3)  -- bytes / 1024^3 (strictly GiB) for readability
    , failure         BIGINT         -- did this specific drive fail? (0/1)
    , min_date        DATE           -- first recorded date of service
    , max_date        DATE           -- last recorded date of service
    , total_days      INT            -- number of days in service (max_date - min_date)
);
-- Populate serials: GROUP BY serial_number already yields one row per drive,
-- so the original SELECT DISTINCT was redundant and has been removed.
-- Time to load: 0:18 (56 sec when using median).
INSERT INTO main.serials
SELECT
    -- Derive the manufacturer from the model-number prefix.
    -- Order matters: 'WDC' must be tested before the shorter 'WD' prefix.
      MAX(CASE WHEN LEFT(model, 3) = 'HGS' THEN 'Hitachi'
               WHEN LEFT(model, 3) = 'Hit' THEN 'Hitachi'
               WHEN LEFT(model, 3) = 'SAM' THEN 'Samsung'
               WHEN LEFT(model, 3) = 'Sam' THEN 'Samsung'
               WHEN LEFT(model, 2) = 'ST'  THEN 'Seagate'
               WHEN LEFT(model, 3) = 'TOS' THEN 'Toshiba'
               WHEN LEFT(model, 3) = 'WDC' THEN 'Western Digital'
               WHEN LEFT(model, 2) = 'WD'  THEN 'Western Digital'
               WHEN LEFT(model, 3) = 'DEL' THEN 'DELL'
               WHEN LEFT(model, 3) = 'Sea' THEN 'Seagate'
               WHEN LEFT(model, 3) = 'MTF' THEN 'Micron'
               WHEN LEFT(model, 3) = 'Mic' THEN 'Micron'
               WHEN LEFT(model, 2) = 'CT'  THEN 'Crucial' END
         ) AS make
    , MAX(model) AS model
    , serial_number
    -- median smooths out days where a drive mis-reported its capacity
    , MEDIAN(capacity_bytes) AS capacity_bytes
    , MEDIAN(capacity_bytes) / (1024 * 1024 * 1024) AS capacity_gb  -- NOTE: 1024^3, i.e. GiB
    , MAX(failure) AS failure                                       -- 1 if the drive ever failed
    , MIN("date") AS min_date
    , MAX("date") AS max_date
    , DATEDIFF('d', MIN("date"), MAX("date")) AS total_days
FROM main.drive_days
GROUP BY serial_number
ORDER BY make, model, serial_number;
-- COPY (SELECT * FROM serials ORDER BY make, model, serial_number) TO 'D:/DuckDB/serials.parquet' (FORMAT parquet); -- < 1 sec
-- COPY (SELECT * FROM serials ORDER BY make, model, serial_number) TO 'D:/DuckDB/serials.csv' (FORMAT csv); -- < 1 sec
-- ================================================================================
-- Exploratory queries over the per-drive summary.
SELECT * FROM main.serials LIMIT 100;
SELECT COUNT(*) AS n FROM main.serials; -- Gross count: 444,500
-- '2024-12-31' is the last snapshot date in the data set: still running on that day = active.
SELECT COUNT(*) AS n FROM main.serials WHERE failure = 0 AND max_date = '2024-12-31'; -- active: 305,029 (not failed and running on last day)
SELECT COUNT(*) AS n FROM main.serials WHERE failure = 0 AND max_date < '2024-12-31'; -- retired: 113,444
SELECT COUNT(*) AS n FROM main.serials WHERE failure = 1; -- failed: 26,027
SELECT * FROM main.serials WHERE failure = 1 ORDER BY make, model, total_days DESC;
-- How many drives of each make?
SELECT make, COUNT(serial_number) AS num_of_drives, MIN(min_date) AS first_used, MAX(max_date) AS last_used --, SUM(failure) AS failures
FROM main.serials GROUP BY make ORDER BY num_of_drives DESC;
-- How many drives of each model? (167 models)
SELECT make, model
     , COUNT(serial_number) AS num_of_drives
     , MEDIAN(capacity_gb::INT) AS gb
     , MIN(min_date) AS first_used
     -- NULL means the model was still active on the final snapshot day
     , CASE MAX(max_date) WHEN '2024-12-31' THEN NULL ELSE MAX(max_date) END AS retired_on
     , SUM(total_days) AS total_drive_days
     , (SUM(total_days) * 1.0 / COUNT(serial_number))::INT AS avg_drive_days
FROM main.serials
GROUP BY make, model
HAVING num_of_drives > 100
ORDER BY make, num_of_drives DESC;
-- 3 outliers in capacity
SELECT * FROM main.serials ORDER BY capacity_gb DESC;
-- ================================================================================
-- Status view: classifies each drive as live / retired / failed as of the final
-- snapshot date (2024-12-31). CREATE OR REPLACE makes re-running the script idempotent.
CREATE OR REPLACE VIEW main.vw_serials_status
AS
SELECT COALESCE(make, 'other') AS make   -- bucket unrecognized model prefixes
     , model
     , serial_number
     , capacity_gb::INT AS capacity_gb
     , total_days
     , min_date AS first_used
     , max_date AS last_used
     , failure
     , CASE WHEN max_date = '2024-12-31' AND failure = 0 THEN 1 ELSE 0 END AS is_active
     , CASE WHEN max_date < '2024-12-31' AND failure = 0 THEN 'retired'
            WHEN max_date = '2024-12-31' AND failure = 0 THEN 'live'
            WHEN failure = 1 THEN 'failed'
            ELSE '' END AS status
FROM main.serials
WHERE capacity_gb > 0   -- exclude drives that mis-reported capacity as 0/negative
;
SELECT * FROM vw_serials_status LIMIT 10;
-- What makes have the most drive days?
SELECT make
     , SUM(total_days) AS total_days
     , COUNT(*) AS num_drives
     , (SUM(total_days) / COUNT(*))::INT AS avg_days_per_drive
FROM main.vw_serials_status
GROUP BY make
ORDER BY total_days DESC;
-- What models have the most drive days?
SELECT make, model, capacity_gb
     , SUM(total_days) AS total_days
     , COUNT(*) AS num_drives
     , (SUM(total_days) / COUNT(*))::INT AS avg_days_per_drive
     , MIN(first_used) AS first_used
     , MAX(last_used) AS last_used
     -- still running on the final snapshot day => 'active', otherwise the retirement date
     , CASE WHEN MAX(last_used) < '2024-12-31' THEN (MAX(last_used))::VARCHAR(10) ELSE 'active' END AS retired_on
FROM main.vw_serials_status
GROUP BY ALL
ORDER BY total_days DESC;
-- ================================================================================
-- Make-level pivot: one row per make with live / failed / retired drive counts.
-- ================================================================================
DROP VIEW IF EXISTS main.vw_serials_make_status_piv;
CREATE VIEW main.vw_serials_make_status_piv
AS
SELECT *
     , (live + failed + retired) AS total
     -- NULLIF guards against a zero denominator (a make with no pivoted statuses)
     , (failed * 100.0 / NULLIF(live + failed + retired, 0))::DECIMAL(5,2) AS pct_failed_drives
FROM (
    -- counts per (make, status); no ORDER BY here — ordering inside a derived
    -- table is discarded, callers should sort the view themselves
    SELECT make, status, COUNT(*) AS n
    FROM main.vw_serials_status
    GROUP BY make, status
) AS x
PIVOT (
    COALESCE(MAX(n), 0) FOR status IN ('live', 'failed', 'retired')
) AS piv
;
-- SELECT * FROM main.vw_serials_make_status_piv ORDER BY live DESC;
-- ================================================================================
-- Model-level pivot: one row per (make, model, capacity) with live / failed / retired counts.
DROP VIEW IF EXISTS main.vw_serials_model_status_piv;
CREATE VIEW main.vw_serials_model_status_piv
AS
SELECT *
     , (live + failed + retired) AS total
     -- NULLIF guards against a zero denominator (a model with no pivoted statuses)
     , (failed * 100.0 / NULLIF(live + failed + retired, 0))::DECIMAL(5,2) AS pct_failed_drives
FROM (
    -- counts per (make, model, capacity, status); no ORDER BY here — ordering
    -- inside a derived table is discarded, callers should sort the view themselves
    SELECT make, model, capacity_gb, status, COUNT(*) AS n
    FROM main.vw_serials_status
    GROUP BY make, model, capacity_gb, status
) AS x
PIVOT (
    COALESCE(MAX(n), 0) FOR status IN ('live', 'failed', 'retired')
) AS piv
;
-- SELECT * FROM main.vw_serials_model_status_piv ORDER BY live DESC;
-- SELECT * FROM main.vw_serials_model_status_piv WHERE model = 'ST12000NM0007' ORDER BY model DESC; -- one reported as 0GB
-- ================================================================================
-- Make-level analysis:
-- use the make-status pivot view and normalize failures by total drive days.
-- ================================================================================
SELECT v.*
     , s.total_drive_days
     -- failures per million drive-days; NULLIF avoids a division-by-zero error
     -- for a make whose drives never accumulated any in-service days
     , (failed / NULLIF(s.total_drive_days / 1000000, 0))::DECIMAL(9,2) AS failures_per_million_drive_days
FROM vw_serials_make_status_piv AS v
JOIN (SELECT make, SUM(total_days) AS total_drive_days FROM main.vw_serials_status GROUP BY make) AS s ON s.make = v.make
ORDER BY v.live DESC
;
-- ================================================================================
-- Model-level analysis:
-- use the model-status pivot view and normalize failures by total drive days.
-- ================================================================================
SELECT v.*
     , s.total_drive_days
     -- failures per million drive-days; NULLIF avoids a division-by-zero error
     , (failed / NULLIF(s.total_drive_days / 1000000, 0))::DECIMAL(9,2) AS failures_per_million_drive_days
     , s.first_used::VARCHAR(10) AS first_used
     --, s.last_used::VARCHAR(10) AS last_used
     , CASE WHEN s.last_used = '2024-12-31' THEN 'active' ELSE s.last_used::VARCHAR(10) END AS retired_on
FROM vw_serials_model_status_piv AS v
JOIN (SELECT make, model
           , SUM(total_days) AS total_drive_days
           , MIN(first_used) AS first_used
           , MAX(last_used) AS last_used
      FROM main.vw_serials_status
      GROUP BY make, model) AS s
  -- join on make AND model: the subquery groups by both, so a model string that
  -- appeared under two makes would otherwise duplicate rows
  ON s.make = v.make AND s.model = v.model
--WHERE v.model = 'ST12000NM0007'
WHERE total > 100   -- only models with a meaningful sample size
ORDER BY failures_per_million_drive_days
--ORDER BY v.live DESC
;
- /*
- ID SMART Attribute Description
- 1 Raw Read Error Rate Rate of hardware read errors when accessing data.
- 2 Throughput Performance Overall performance of the HDD (lower values indicate issues).
- 3 Spin-Up Time Time it takes for the drive to reach full speed.
- 4 Start/Stop Count Number of times the HDD has been started or stopped.
- 5 Reallocated Sectors Count Number of bad sectors replaced with spare sectors (higher = failing drive).
- 6 Read Channel Margin Internal measurement of read performance.
- 7 Seek Error Rate Frequency of seek errors when positioning the drive head.
- 8 Seek Time Performance Average time taken for seek operations.
- 9 Power-On Hours (POH) Total number of hours the HDD has been powered on.
- 10 Spin Retry Count Number of times the drive had to retry spinning up.
- 11 Calibration Retry Count Number of failed calibration attempts.
- 12 Power Cycle Count Number of times the HDD has been power-cycled (turned off/on).
- 13 Soft Read Error Rate Number of corrected read errors (not always used).
- 183 SATA Downshift Error Count Number of times the SATA link had to be downgraded.
- 184 End-to-End Error Data integrity errors in the HDD cache/RAM.
- 187 Reported Uncorrectable Errors Number of errors that could not be corrected.
- 188 Command Timeout Number of commands that timed out due to hardware issues.
- 189 High Fly Writes Occurs when the drive head is positioned incorrectly.
- 190 Airflow Temperature (or Drive Temperature) Current temperature of the HDD (critical if too high).
- 191 G-Sense Error Rate Number of errors due to shocks or vibrations.
- 192 Power-Off Retract Count Number of times the drive head was parked due to power loss.
- 193 Load Cycle Count Number of times the drive head has been loaded/unloaded.
- 194 Temperature Temperature of the HDD in degrees Celsius.
- 195 Hardware ECC Recovered Number of errors corrected by hardware Error Correction Code (ECC).
- 196 Reallocation Event Count Number of times a bad sector was replaced.
- 197 Current Pending Sector Count Number of unstable sectors awaiting reallocation (higher = bad).
- 198 Uncorrectable Sector Count Number of sectors that could not be recovered (bad sign).
- 199 UltraDMA CRC Error Count Data transfer errors due to faulty cables or connectors.
- 200 Write Error Rate Rate of errors during write operations.
- 201 Soft Read Error Rate Corrected errors during reads.
- 202 Data Address Mark Errors Errors in address marking of the disk.
- 220 Disk Shift Measures disk platter movement due to impact/shock.
- 222 Loaded Hours Number of hours the HDD head has been loaded.
- 223 Load Retry Count Number of failed attempts to load the drive head.
- 224 Load Friction Measures resistance while loading the HDD head.
- 226 Load-in Time Time taken for the HDD head to load.
- 240 Head Flying Hours Total hours the read/write head has been operational.
- 241 Total LBAs Written Total data written to the HDD in Logical Block Addresses (LBAs).
- 242 Total LBAs Read Total data read from the HDD in LBAs.
- Key Indicators for Drive Failure
- If any of these values are high or increasing, your drive may be failing:
- Reallocated Sectors Count (ID 5) → Bad sectors that were replaced.
- Current Pending Sector Count (ID 197) → Unstable sectors waiting to be remapped.
- Uncorrectable Sector Count (ID 198) → Sectors that cannot be recovered.
- Reported Uncorrectable Errors (ID 187) → High values indicate serious errors.
- Command Timeout (ID 188) → Indicates drive delays or failures.
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement