Advertisement
Guest User

smartmon.sh

a guest
Aug 28th, 2021
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 13.71 KB | None | 0 0
  1. #!/usr/bin/env bash
  2. # Script informed by the collectd monitoring script for smartmontools (using smartctl)
  3. # by Samuel B. <samuel_._behan_(at)_dob_._sk> (c) 2012
  4. # source at: http://devel.dob.sk/collectd-scripts/
  5.  
  6. # TODO: This probably needs to be a little more complex.  The raw numbers can have more
  7. #       data in them than you'd think.
  8. #       http://arstechnica.com/civis/viewtopic.php?p=22062211
  9.  
  10. # Formatting done via shfmt -i 2
  11. # https://github.com/mvdan/sh
  12.  
  # Absolute path of the smartctl binary; left empty when it is not on PATH.
  # `command -v` is a shell builtin, so this no longer depends on the external
  # `which` utility being installed.
  SMARTCTL="$(command -v smartctl)"
  14. #FORCED_DEVICE_LIST=$(cat << EOF
  15. #/dev/sg3|scsi
  16. #/dev/sg4|sat
  17. #/dev/sg5|sat
  18. #/dev/sg6|sat
  19. #/dev/sg7|scsi
  20. #/dev/sg8|sat
  21. #/dev/sg9|sat
  22. #/dev/sdc|sat
  23. #EOF
  24. #)
  25.  
  # awk program applied to the attribute table printed by `smartctl -A`.
  # A row qualifies when its first field is a numeric id and its second field
  # is an attribute name.  Every '-' in the row is rewritten to '_', then four
  # metric lines are printed from fields 4, 5, 6 and 10 (value, worst,
  # threshold, raw value).  The label list is supplied by the caller through
  # `awk -v labels=...`.
  parse_smartctl_attributes_awk='$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
    gsub(/-/, "_");
    printf "%s_value{%s,smart_id=\"%s\"} %d\n", tolower($2), labels, $1, $4
    printf "%s_worst{%s,smart_id=\"%s\"} %d\n", tolower($2), labels, $1, $5
    printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", tolower($2), labels, $1, $6
    printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", tolower($2), labels, $1, $10
  }'
  37.  
  # Names of the SMART attributes whose metrics are kept, joined with '|' so
  # the value can be used directly as an alternation in `grep -E`.
  smartmon_attrs="$(
    set -- \
      airflow_temperature_cel \
      command_timeout \
      current_pending_sector \
      end_to_end_error \
      erase_fail_count \
      g_sense_error_rate \
      hardware_ecc_recovered \
      host_reads_mib \
      host_reads_32mib \
      host_writes_mib \
      host_writes_32mib \
      load_cycle_count \
      media_wearout_indicator \
      multi_zone_error_rate \
      wear_leveling_count \
      nand_writes_1gib \
      offline_uncorrectable \
      percent_lifetime_remain \
      power_cycle_count \
      power_off_retract_count \
      power_on_hours \
      program_fail_count \
      raw_read_error_rate \
      reallocated_event_count \
      reallocated_sector_ct \
      reallocate_nand_blk_cnt \
      reported_uncorrect \
      sata_downshift_count \
      seek_error_rate \
      spin_retry_count \
      spin_up_time \
      start_stop_count \
      temperature_case \
      temperature_celsius \
      temperature_internal \
      total_lbas_read \
      total_lbas_written \
      total_host_sector_write \
      udma_crc_error_count \
      unsafe_shutdown_count \
      workld_host_reads_perc \
      workld_media_wear_indic \
      workload_minutes \
      throughput_performance \
      seek_time_performance \
      helium_level \
      unknown_attribute
    # "$*" joins the positional parameters with the first character of IFS.
    IFS='|'
    printf '%s\n' "$*"
  )"
  90.  
  #######################################
  # Turn the attribute table read from stdin into metric lines.
  # Globals:
  #   parse_smartctl_attributes_awk (read) - awk program run over the input
  #   smartmon_attrs (read) - '|'-separated names used as the grep -E filter
  # Arguments:
  #   $1 - disk, $2 - disk type, $3 - device name, $4 - serial number;
  #        all four are emitted as labels on every metric line
  # Outputs:
  #   Metric lines on stdout, restricted to lines matching smartmon_attrs.
  # Returns:
  #   Exit status of the final grep (non-zero when nothing matched).
  #######################################
  parse_smartctl_attributes() {
    local disk="$1"
    local disk_type="$2"
    local name="$3"
    local serial_number="$4"
    local labels="disk=\"${disk}\",type=\"${disk_type}\",name=\"${name}\",serial_number=\"${serial_number}\""
    # Leading blanks are stripped first so the row id is always field 1;
    # awk diagnostics are discarded so they never reach the metrics stream.
    sed 's/^ \+//g' |
      awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
      grep -E "(${smartmon_attrs})"
  }
  102.  
  #######################################
  # Extract a fixed set of SCSI/SAS counters from the smartctl report read
  # from stdin and print them as *_raw_value metric lines.
  # Arguments:
  #   $1 - disk, $2 - disk type, $3 - device name (all emitted as labels)
  # Outputs:
  #   One metric line on stdout for every counter found in the input.
  # Returns:
  #   0, including when no counter was present.
  #######################################
  parse_smartctl_scsi_attributes() {
    local disk="$1"
    local disk_type="$2"
    local name="$3"
    local labels="disk=\"${disk}\",type=\"${disk_type}\",name=\"${name}\""
    # All working variables are local and start empty, so a value parsed for
    # one device can never be reported for the next one.
    local line attr_type attr_value
    local power_on='' temp_cel='' lbas_read='' power_cycle='' grown_defects=''
    local non_medium='' read_uncorrected='' write_uncorrected='' verify_uncorrected=''
    # awk programs printing field 1 / field 7 in %e notation.
    local -r first_field='{ printf "%e\n", $1 }'
    local -r seventh_field='{ printf "%e\n", $7 }'

    while read -r line; do
      # "key = value" and "key: value" lines are handled alike: '=' is mapped
      # to ':' and the line is split on the first ':'.
      attr_type="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
      attr_value="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
      case "${attr_type}" in
      number_of_hours_powered_up_) power_on="$(awk "${first_field}" <<<"${attr_value}")" ;;
      Current_Drive_Temperature) temp_cel="$(awk "${first_field}" <<<"${attr_value}")" ;;
      Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(awk "${first_field}" <<<"${attr_value}")" ;;
      Accumulated_start-stop_cycles) power_cycle="$(awk "${first_field}" <<<"${attr_value}")" ;;
      Elements_in_grown_defect_list) grown_defects="$(awk "${first_field}" <<<"${attr_value}")" ;;
      Non-medium_error_count) non_medium="$(awk "${first_field}" <<<"${attr_value}")" ;;
      read) read_uncorrected="$(awk "${seventh_field}" <<<"${attr_value}")" ;;
      write) write_uncorrected="$(awk "${seventh_field}" <<<"${attr_value}")" ;;
      verify) verify_uncorrected="$(awk "${seventh_field}" <<<"${attr_value}")" ;;
      esac
    done

    # `[[ -z ]] || print` keeps the exit status at 0 when a counter is absent.
    [[ -z "${power_on}" ]] || printf '%s\n' "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
    [[ -z "${temp_cel}" ]] || printf '%s\n' "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
    [[ -z "${lbas_read}" ]] || printf '%s\n' "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
    [[ -z "${power_cycle}" ]] || printf '%s\n' "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
    [[ -z "${grown_defects}" ]] || printf '%s\n' "sas_grown_defects_count_raw_value{${labels},smart_id=\"0\"} ${grown_defects}"
    [[ -z "${non_medium}" ]] || printf '%s\n' "sas_non_medium_errors_count_raw_value{${labels},smart_id=\"0\"} ${non_medium}"
    [[ -z "${read_uncorrected}" ]] || printf '%s\n' "sas_read_uncorrected_errors_count_raw_value{${labels},smart_id=\"0\"} ${read_uncorrected}"
    [[ -z "${write_uncorrected}" ]] || printf '%s\n' "sas_write_uncorrected_errors_count_raw_value{${labels},smart_id=\"0\"} ${write_uncorrected}"
    [[ -z "${verify_uncorrected}" ]] || printf '%s\n' "sas_verify_uncorrected_errors_count_raw_value{${labels},smart_id=\"0\"} ${verify_uncorrected}"
  }
  133.  
  #######################################
  # Extract a fixed set of NVMe health counters from the smartctl report read
  # from stdin and print them as *_raw_value metric lines.
  # Arguments:
  #   $1 - disk, $2 - disk type, $3 - device name, $4 - serial number
  #        (all emitted as labels)
  # Outputs:
  #   One metric line on stdout for every counter found in the input.
  # Returns:
  #   0, including when no counter was present.
  #######################################
  parse_smartctl_nvme_attributes() {
    local disk="$1"
    local disk_type="$2"
    local name="$3"
    local serial_number="$4"
    local labels="disk=\"${disk}\",type=\"${disk_type}\",name=\"${name}\",serial_number=\"${serial_number}\""
    # All working variables are local and start empty, so a value parsed for
    # one device can never be reported for the next one.
    local line attr_type attr_value
    local temp_cel='' wear_level='' host_read_commands='' host_write_commands=''
    local data_read='' data_written='' power_cycles='' power_on='' unsafe_shutdown_count=''
    # awk programs, all printing in %e notation:
    #   plain_number   - field 1 as-is
    #   grouped_number - field 1 after deleting every ',' from the line
    #   leading_number - the first word only, with its ',' deleted
    local -r plain_number='{ printf "%e\n", $1 }'
    local -r grouped_number='{ gsub(/,/, ""); printf "%e\n", $1 }'
    local -r leading_number='{ gsub(/,/, "", $1); printf "%e\n", $1 }'

    while read -r line; do
      # "key = value" and "key: value" lines are handled alike: '=' is mapped
      # to ':' and the line is split on the first ':'.
      attr_type="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
      attr_value="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
      case "${attr_type}" in
      Temperature) temp_cel="$(awk "${plain_number}" <<<"${attr_value}")" ;;
      Percentage_Used) wear_level="$(awk "${plain_number}" <<<"${attr_value}")" ;;
      Host_Read_Commands) host_read_commands="$(awk "${grouped_number}" <<<"${attr_value}")" ;;
      Host_Write_Commands) host_write_commands="$(awk "${grouped_number}" <<<"${attr_value}")" ;;
      Data_Units_Read) data_read="$(awk "${leading_number}" <<<"${attr_value}")" ;;
      Data_Units_Written) data_written="$(awk "${leading_number}" <<<"${attr_value}")" ;;
      Power_Cycles) power_cycles="$(awk "${grouped_number}" <<<"${attr_value}")" ;;
      Power_On_Hours) power_on="$(awk "${grouped_number}" <<<"${attr_value}")" ;;
      Unsafe_Shutdowns) unsafe_shutdown_count="$(awk "${grouped_number}" <<<"${attr_value}")" ;;
      esac
    done

    # `[[ -z ]] || print` keeps the exit status at 0 when a counter is absent.
    [[ -z "${temp_cel}" ]] || printf '%s\n' "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
    [[ -z "${wear_level}" ]] || printf '%s\n' "wear_level_raw_value{${labels},smart_id=\"0\"} ${wear_level}"
    [[ -z "${host_read_commands}" ]] || printf '%s\n' "host_read_commands_raw_value{${labels},smart_id=\"0\"} ${host_read_commands}"
    [[ -z "${host_write_commands}" ]] || printf '%s\n' "host_write_commands_raw_value{${labels},smart_id=\"0\"} ${host_write_commands}"
    [[ -z "${data_read}" ]] || printf '%s\n' "data_units_read_raw_value{${labels},smart_id=\"0\"} ${data_read}"
    [[ -z "${data_written}" ]] || printf '%s\n' "data_units_written_raw_value{${labels},smart_id=\"0\"} ${data_written}"
    [[ -z "${power_cycles}" ]] || printf '%s\n' "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycles}"
    [[ -z "${power_on}" ]] || printf '%s\n' "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
    # NOTE(review): smart_id "9" is also used for power_on_hours above; it is
    # kept unchanged here so the existing series keeps its labels - confirm
    # whether a different id was intended.
    [[ -z "${unsafe_shutdown_count}" ]] || printf '%s\n' "unsafe_shutdowns_raw_value{${labels},smart_id=\"9\"} ${unsafe_shutdown_count}"
  }
  166.  
  # Private helper: print $1 with every run of whitespace collapsed to one
  # space and both ends trimmed.  No pathname expansion is performed.
  _smartmon_squeeze() {
    local -a words=()
    # -d '' reads to end of input, so `read` reports failure at EOF even
    # though the array has been filled; that status is deliberately ignored.
    read -r -d '' -a words <<<"$1" || true
    printf '%s\n' "${words[*]}"
  }

  #######################################
  # Parse the `smartctl -i -H` report read from stdin and print the
  # device_info metric plus the SMART available/enabled/healthy gauges.
  # Arguments:
  #   $1 - disk, $2 - disk type, $3 - device name (all emitted as labels)
  # Outputs:
  #   Four metric lines on stdout.
  #######################################
  parse_smartctl_info() {
    local -i smart_available=0 smart_enabled=0 smart_healthy=0
    local disk="$1" disk_type="$2" name="$3"
    local model_family='N/A' device_model='N/A' size='N/A' serial_number='N/A' fw_version='N/A' vendor='N/A' product='N/A' revision='N/A' lun_id='N/A'
    local line info_type info_value field ident

    while read -r line; do
      info_type="$(printf '%s\n' "${line}" | cut -f1 -d: | tr ' ' '_')"
      # The value ends up inside a quoted label, so every backslash and every
      # double quote is escaped (backslashes first, to avoid double escaping).
      info_value="$(printf '%s\n' "${line}" | cut -f2- -d: |
        sed -e 's/^ \+//' -e 's/\\/\\\\/g' -e 's/"/\\"/g')"
      case "${info_type}" in
      Model_Family) model_family="${info_value}" ;;
      Device_Model | Model_Number) device_model="${info_value}" ;;
      Serial_[Nn]umber) serial_number="${info_value}" ;;
      Firmware_Version) fw_version="${info_value}" ;;
      User_Capacity | Namespace_1_Size/Capacity)
        # Keep the first word (the byte count), drop its thousands
        # separators and convert bytes to whole GiB-sized units.
        size="$(printf '%s\n' "${info_value%%[[:space:]]*}" | tr -d ',' |
          awk '{printf "%d GB\n", $1/1024/1024/1024}')"
        ;;
      Vendor) vendor="${info_value}" ;;
      Product) product="${info_value}" ;;
      Revision) revision="${info_value}" ;;
      Logical_Unit_id) lun_id="${info_value}" ;;
      SMART_support_is)
        case "$(_smartmon_squeeze "${info_value}")" in
        Enabled*) smart_enabled=1 ;;
        Availab*) smart_available=1 ;;
        Unavail*) smart_available=0 ;;
        esac
        ;;
      SMART_overall-health_self-assessment_test_result)
        if [[ "${info_value//[[:space:]]/}" == 'PASSED' ]]; then
          smart_healthy=1
        fi
        ;;
      SMART_Health_Status)
        if [[ "${info_value//[[:space:]]/}" == 'OK' ]]; then
          smart_healthy=1
        fi
        ;;
      esac
    done

    # No model line in the report: fall back to "<vendor> <product>".
    if [[ "${device_model}" == 'N/A' && ("${vendor}" != 'N/A' || "${product}" != 'N/A') ]]; then
      device_model="${vendor} ${product}"
    fi

    # Normalise whitespace in the values used as labels.
    for field in disk name vendor model_family device_model serial_number fw_version; do
      printf -v "${field}" '%s' "$(_smartmon_squeeze "${!field}")"
    done

    ident="disk=\"${disk}\",type=\"${disk_type}\",name=\"${name}\""
    printf '%s\n' "device_info{${ident},vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",size=\"${size}\",firmware_version=\"${fw_version}\",smart_healthy=\"${smart_healthy}\"} 1"
    printf '%s\n' "device_smart_available{${ident},serial_number=\"${serial_number}\"} ${smart_available}"
    printf '%s\n' "device_smart_enabled{${ident},serial_number=\"${serial_number}\"} ${smart_enabled}"
    printf '%s\n' "device_smart_healthy{${ident},serial_number=\"${serial_number}\"} ${smart_healthy}"
  }
  226.  
  # awk program used by format_output.  Input is split on '{', so field 1 is
  # the metric name.  Whenever that name differs from the one on the previous
  # line a "# HELP" / "# TYPE ... gauge" header is printed first; every input
  # line is then printed with the "smartmon_" prefix.
  output_format_awk='BEGIN { previous = "" }
  previous != $1 {
    print "# HELP smartmon_" $1 " SMART metric " $1;
    print "# TYPE smartmon_" $1 " gauge";
    previous = $1
  }
  { print "smartmon_" $0 }'

  # Sort the metric lines read from stdin so equal metric names are adjacent,
  # then add the headers and the name prefix described above.
  format_output() {
    sort | awk -F'{' "${output_format_awk}"
  }
  243.  
  # Version string taken from the first line of `smartctl -V` (its second
  # word, when the first word is "smartctl").
  smartctl_version="$("${SMARTCTL}" -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

  echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

  # Stop here unless the major version (the digits before the first '.') is
  # at least 6.  A version without that shape counts as too old, and an empty
  # major number compares as 0.
  if [[ ! "${smartctl_version}" =~ ^([0-9]*)\. || "${BASH_REMATCH[1]:-0}" -lt 6 ]]; then
    exit
  fi
  251.  
  # Devices to inspect, as whitespace-separated "<device>|<type>" entries.
  # A non-empty FORCED_DEVICE_LIST wins; otherwise the list is built from the
  # first and third column of every /dev line of `smartctl --scan-open`.
  device_list="${FORCED_DEVICE_LIST}"
  if [[ -z ${device_list} ]]; then
    device_list="$($SMARTCTL --scan-open | awk '/^\/dev/{print $1 "|" $3}')"
  fi
  259.  
  # Private helper: print $1 with every run of whitespace (newlines included)
  # collapsed to one space and both ends trimmed.  No pathname expansion.
  _smartmon_trim() {
    local -a words=()
    # -d '' reads to end of input, so `read` reports failure at EOF even
    # though the array has been filled; that status is deliberately ignored.
    read -r -d '' -a words <<<"$1" || true
    printf '%s\n' "${words[*]}"
  }

  # Private helper: from the report text in $1, print the second
  # ':'-separated field of every line matching the regular expression $2.
  _smartmon_field() {
    awk -F ':' -v pattern="$2" '$0 ~ pattern { print $2 }' <<<"$1"
  }

  # Emit the metrics of every listed device; the combined output of the loop
  # is sorted and decorated by format_output.
  for device in ${device_list}; do
    # Entries look like "<device>|<type>": first and second '|' field.
    disk="${device%%|*}"
    type="${device#*|}"
    type="${type%%|*}"

    # A single `smartctl -i` report supplies the serial number and, further
    # down, the model strings used for the name label.
    info="$("${SMARTCTL}" -i -d "${type}" "${disk}")"
    serial_number="$(_smartmon_trim "$(_smartmon_field "${info}" '[Ss]erial [Nn]umber')")"

    active=1
    # Check if the device is in a low-power mode
    "${SMARTCTL}" -n standby -d "${type}" "${disk}" >/dev/null || active=0
    echo "device_active{disk=\"${disk}\",type=\"${type}\",serial_number=\"${serial_number}\"}" "${active}"
    # Skip further metrics to prevent the disk from spinning up
    if ((active == 0)); then
      continue
    fi

    # Get Device name label
    case "${type}" in
    scsi) name="$(_smartmon_field "${info}" 'Vendor') $(_smartmon_field "${info}" 'Product')" ;;
    nvme) name="$(_smartmon_field "${info}" 'Model Number')" ;;
    *) name="$(_smartmon_field "${info}" 'Device Model')" ;;
    esac
    name="$(_smartmon_trim "${name}")"

    echo "smartctl_run{disk=\"${disk}\",type=\"${type}\",name=\"${name}\",serial_number=\"${serial_number}\"}" "$(TZ=UTC date '+%s')"
    # Get the SMART information and health
    "${SMARTCTL}" -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" "${name}"
    # Get the SMART attributes
    case "${type}" in
    sat | atacam | sat+megaraid*)
      "${SMARTCTL}" -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${name}" "${serial_number}"
      ;;
    nvme)
      "${SMARTCTL}" -A -d "${type}" "${disk}" | parse_smartctl_nvme_attributes "${disk}" "${type}" "${name}" "${serial_number}"
      ;;
    scsi)
      "${SMARTCTL}" -a -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${name}"
      ;;
    megaraid*)
      "${SMARTCTL}" -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${name}"
      ;;
    *)
      continue
      ;;
    esac
  done | format_output
  306.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement