Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# 2015-2016 by ghost75 v1.61
# Tested on Nas4free and Linux.
#
# Monitors ZFS pool health, free space, scrub age and S.M.A.R.T. attributes
# and mails warnings when something changes (run with -h for details).
maxusedperc="80"        # warn when pool used capacity exceeds this percent
minfreegig="20"         # only Gigabytes and digits allowed
maxwarncountperday="1"  # how many disk space warnings per day? 0 to disable space warning mail
scrubexpire="1728000"   # after how many seconds scrub will expire: 20d x 24h x 3600s = 1728000
scrubrunhour="22"       # run scrub off peak time, 24h time format
# state files that persist what was already reported between cron runs
spacewarncountfile="/tmp/spacewarn_count.log" # to get track when to send mail
dayfile="/tmp/spacewarn_day.log"              # to store actual day
poolmailfile="/tmp/pool.log"                  # log related to zfs pool
smarterrorfile="/tmp/smarterrors.log"         # log includes smart attribute and unhealthy state if disk has error
smartcache="/tmp/smartcache.log"              # stores output of smartctl
# smartctl device options: one variable per disk name, resolved via ${!disk}
sdc="sat"
# email address needs only to be set if you dont want to use the values from
# Nas4Free WebUI (System|Advanced|Email and Disk|Management|Smart or Status|Email Report)
# or if you have a non-NAS4Free system
email_to="root" # required on non Nas4free
#email_from=""
############################################################
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# pull in the user environment when present (quoted; previously an unquoted
# unconditional source that printed an error if the file was missing)
if [ -f "$HOME/.profile" ]; then
  source "$HOME/.profile"
fi
#######################################
# Send a notification mail.
# Globals (read): platform, config, email_from, email_to, subject, body
# On Nas4free (FreeBSD with a parsed config.xml) mail is handed to msmtp with
# the WebUI settings; on other supported platforms the mail(1) command is used.
# Returns non-zero when the mail could not be handed off, so callers can
# retry on the next run.
#######################################
send_mail() {
  if [ "$platform" == "FreeBSD" ] && [ ! -z "$config" ]; then
    # '%b' expands the \n escapes but keeps the message as DATA, so a '%'
    # inside subject/body no longer corrupts the mail (was used as format)
    printf '%b' "From:$email_from\nTo:$email_to\nSubject:$subject\n\n$body" | /usr/local/bin/msmtp --file=/var/etc/msmtp.conf -t
  elif [ "$platform" == "Unix" ] || [ "$platform" == "Linux" ] || [ "$platform" == "FreeBSD" ]; then
    if [ ! -z "$email_to" ]; then
      mail -s "$subject" "$email_to" <<< "$body"
    else
      echo "Cannot send mail because parameter email_to is missing, will try to send again next time" >&2
    fi
  fi
}
#######################################
# Print usage information and exit 0.
#######################################
run_help() {
  cat <<'EOF'

 Monitor your ZFS pools

 Script will monitor disk space and send email if thresholds are reached,
 it will send mail only a specific amount of times per day, which can be set
 by parameter.
 Also there will be an email for any zfs pools which are not in online state
 (i.e. degraded) or those with chksum errors. There will be an email once for
 every new error after the old error is fixed.
 Scrub will be also taken place on a regular basis (see scrubexpire parameter)
 if there are no chksum errors, scrub errors from last run and if pool is ONLINE
 and not in resilver state.
 S.M.A.R.T. attributes id5,10,196,197,198 will be monitored for any raw value > 0.
 There will be email for every new disk error but no reoccuring mail if the
 error stays the same. If the script cannot send mail, it will try next time.

 The following parameters are supported:
 -e scrubexpire, how many seconds there will be no new scrub
 -f email_from
 -g minfreegig, for disk space monitoring
 -p maxusedperc, for disk space monitoring
 -r scrubrunhour, in 24h format
 -t email_to 

 It can be scheduled with cron to run like every 5min i.e.:
 */5 * * * * root /usr/local/sbin/zfshealth
 You may need also to set PATH variable in cron file.

EOF
  exit 0
}
# Command line options, if given, override the defaults configured above.
while getopts 'e:f:g:hp:r:t:' opt; do
  case "$opt" in
    e) scrubexpire=$OPTARG ;;
    f) email_from=$OPTARG ;;
    g) minfreegig=$OPTARG ;;
    h) run_help ;;
    p) maxusedperc=$OPTARG ;;
    r) scrubrunhour=$OPTARG ;;
    t) email_to=$OPTARG ;;
  esac
done
## which platform are we?
# Sets platform and myhostname; aborts when the OS is unsupported.
case "$(uname)" in
  Linux)
    platform="Linux"
    myhostname=$(hostname)
    ;;
  FreeBSD)
    platform="FreeBSD"
    myhostname=$HOST
    ;;
  Unix)
    platform="Unix"
    myhostname=$HOST
    ;;
  *)
    platform="unknown"
    myhostname="unknown"
    echo "OS is not supported, exit here"
    exit 1
    ;;
esac
###########################
# Checking days (for limiting space warning mails)
daynow=$(date +"%d")
# seed the day file on the very first run
[ -f "$dayfile" ] || echo $daynow > $dayfile
# remember which day the previous run saw, then record today's day
read -r dayfromfile <<< $(cat $dayfile)
echo $daynow > $dayfile
#######################
# current time in epoch seconds, used later for the scrub expiry check
currentscrubdate=$(date +"%s")
#######################
# Find out if it's a nas4free embedded or full install in order to get the email address
if [ -f /etc/platform ] && [ "$platform" == "FreeBSD" ]; then
  if grep -q embedded /etc/platform; then
    config="/cf/conf/config.xml" # embedded installs keep the config on /cf
    # remount /cf when the config is not visible there
    [ -f "$config" ] || { umount /cf && mount /cf; }
  else
    config="/conf/config.xml" # full installs
  fi
  # Email parameters: fall back to the WebUI values when not set already
  [ -z "$email_to" ] && [ -f $config ] && email_to=$(/usr/local/bin/xml sel -t -v "//smartd/email/to" $config)
  [ -z "$email_from" ] && [ -f $config ] && email_from=$(/usr/local/bin/xml sel -t -v "//email/from" $config)
fi
##############################################################################################################
## Check ZFS pools for health, scrub and disk space
# Get disk space and pool state; 4 columns: name, free, cap, health.
# NOTE: the loop body runs in a pipeline subshell, so everything that must
# survive to the next run is persisted in files under /tmp, not in variables.
zpool list -H -o name,free,cap,health | while read output
do
  ################################
  # get basic stuff like pool name and health
  pool=$(echo $output | awk '{ print $1 }')
  health=$(echo $output | awk '{ print $4 }')
  if [ "$health" == "ONLINE" ]; then
    echo -e "ZFS pool $pool state is \e[32m${health}\e[0m"
  else
    echo -e "ZFS pool $pool state is \e[31m${health}\e[0m"
  fi
  ##################################
  # per-pool state files for pool errors, health and scrub state, derived
  # from poolmailfile by inserting the pool name before the extension
  poolpath="${poolmailfile%/*}"
  poolfilename=$(basename "$poolmailfile")
  poolfilename="${poolfilename%.*}"
  poolext="${poolmailfile##*.}"
  poolmailfilehealth="${poolpath}/${poolfilename}-${pool}-health.${poolext}"
  poolmailfilescrub="${poolpath}/${poolfilename}-${pool}-scrub.${poolext}"
  poolfilescrubstate="${poolpath}/${poolfilename}-${pool}-scrubstate.${poolext}"
  poolfilechksumstate="${poolpath}/${poolfilename}-${pool}-chksumstate.${poolext}"
  # per-pool file for disk space warnings
  spacewarnpath="${spacewarncountfile%/*}"
  spacewarnfilename=$(basename "$spacewarncountfile")
  spacewarnfilename="${spacewarnfilename%.*}"
  spacewarnext="${spacewarncountfile##*.}"
  spacewarncountfilenew="${spacewarnpath}/${spacewarnfilename}-${pool}.${spacewarnext}"
  # create if it doesn't exist
  if [ ! -f "$spacewarncountfilenew" ]; then
    echo 0 > $spacewarncountfilenew
  fi
  ###################################
  # see if there are chksum errors: keep the read/write/cksum counters of
  # ONLINE devices, dropping "000" lines (all three counters zero)
  if [ ! -f "$poolfilechksumstate" ]; then
    touch $poolfilechksumstate
  fi
  read -r poolchksumstateold <<< $(cat $poolfilechksumstate)
  zpool status ${pool} | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000 > $poolfilechksumstate
  read -r poolchksumstate <<< $(cat $poolfilechksumstate)
  if [ "${poolchksumstate}" ]; then
    # BUGFIX: was a plain echo (no -e) of the state file PATH; show the counters
    echo -e "Warning: \e[31m${poolchksumstate}\e[0m ZFS Chksum errors found!"
    if [ "${poolchksumstate}" != "${poolchksumstateold}" ]; then
      subject="ZFS pool $pool server $myhostname checksum errors found"
      # BUGFIX: $poolchksumerrors was never assigned anywhere; use $poolchksumstate
      body="Warning: ZFS pool $pool has $poolchksumstate checksum errors. Please check your disks. There will be no more automatic scrub on this pool, until this is fixed."
      send_mail
      # if mail could not be sent, forget the state so we retry next run
      if [ $? -ne 0 ]; then rm $poolfilechksumstate; fi
    else
      echo "Mail about chksum errors was already sent"
    fi
  else
    echo -e "\e[32m0\e[0m ZFS chksum errors found on pool $pool"
  fi
  #######################################
  # monitor scrub and issue if needed (doesn't run when pool is not online,
  # scrub or resilver is in place, chksum errors were found or it was never run before)
  if [ "$scrubrunhour" ] || [ "$scrubexpire" ]; then
    if [ ! -f "$poolmailfilescrub" ]; then
      echo "0" > $poolmailfilescrub
    fi
    # get pool scrub mail state from last run, if not 0 then mail was sent already
    read -r poolmailscrub <<< $(cat $poolmailfilescrub)
    if [ ! -f "$poolfilescrubstate" ]; then
      touch $poolfilescrubstate
    fi
    # get scrub state from last run
    read -r scrubstateold <<< $(cat $poolfilescrubstate)
    # get actual scrub state
    zpool status $pool | egrep "none requested|resilver|scrub in progress|scrub repaired" > $poolfilescrubstate
    read -r scrubstate <<< $(cat $poolfilescrubstate)
    if [ $(echo $scrubstate | egrep -c "none requested|resilvered") -ge 1 ]; then
      echo "ZFS scrub was not run before on ${pool} or it was resilvered, cannot monitor or issue new scrubs automatically"
      if [ "$health" == "ONLINE" ] && [ "$poolmailscrub" != "1" ]; then # if drive is online but scrub cannot be scheduled
        echo "Cannot schedule scrub"
        subject="ZFS pool $pool server $myhostname cannot schedule scrub"
        body="Warning: ZFS pool $pool is ONLINE but cannot be scrubed automatically, please run scrub manually one time."
        send_mail
        if [ $? -eq 0 ]; then echo "1" > $poolmailfilescrub; fi
      elif [ "$health" == "ONLINE" ] && [ "$poolmailscrub" == "1" ]; then
        echo "Cannot schedule scrub, mail was already sent"
      fi
    elif [ "$health" == "ONLINE" ] && [ "$poolmailscrub" != "0" ]; then
      echo "ZFS scrub check is working now"
      subject="ZFS pool $pool on server $myhostname scrub check is ok now"
      body="Notice: ZFS pool $pool will be scrubed automatically from now on"
      send_mail
      echo "0" > $poolmailfilescrub # reset counter
    elif [ $(echo $scrubstate | egrep -c "scrub repaired 0") -ge 1 ] && [ "$scrubstate" != "$scrubstateold" ] && [ "$scrubstateold" ]; then
      echo "ZFS scrub finished on ${pool}"
      subject="ZFS pool $pool on server $myhostname scrub finished"
      # BUGFIX: body said "pool $myhostname"; report the pool name
      body="Scrub finished on pool $pool with result $scrubstate"
      send_mail
    elif [ $(echo $scrubstate | egrep -c "scrub repaired") -ge 1 ] && [ "$scrubstate" != "$scrubstateold" ] && [ "$scrubstateold" ]; then
      echo "ZFS scrub finished on ${pool} with errors"
      subject="ZFS pool $pool on server $myhostname scrub finished with errors"
      body="Scrub finished on pool $pool with errors. There will be no subsequent scrub until error is cleared out. Result: $scrubstate"
      send_mail
    elif [ $(echo $scrubstate | egrep -c "scrub in progress|resilver") -ge 1 ]; then
      echo "ZFS scrub or resilver is already in progress on ${pool}, cannot schedule scrub"
    elif [ "$health" != "ONLINE" ]; then
      echo "Cannot schedule scrub, because ZFS pool $pool is not online."
    elif [ "${poolchksumstate}" ]; then
      # BUGFIX: this guard tested $poolchksumerrors (never set) and could never fire
      echo "Cannot scrub, because ZFS pool $pool has chksum errors."
    elif [ $(echo $scrubstate | egrep -c "scrub repaired 0") -ge 1 ] && [ "$health" == "ONLINE" ]; then
      # everything ok, scrub can be scheduled; parse the date of the last scrub
      if [ "$platform" == "FreeBSD" ] || [ "$platform" == "Unix" ]; then
        #scrubrawdate=$(zpool status $pool | grep scrub | awk '{print $15 $12 $13}')
        scrubrawdate=$(echo $scrubstate | grep "scrub repaired" | awk '{print $15 $12 $13}')
        scrubdate=$(date -j -f '%Y%b%e-%H%M%S' $scrubrawdate'-000000' +%s)
      elif [ $platform == "Linux" ]; then
        #scrubrawdate=$(zpool status $pool | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
        scrubrawdate=$(echo $scrubstate | grep "scrub repaired" | awk '{print $11" "$12" " $13" " $14" "$15}')
        scrubdate=$(date -d "$scrubrawdate" +%s)
      fi
      if [ $(($currentscrubdate - $scrubdate)) -ge $scrubexpire ] && [ ! -z "$scrubdate" ]; then
        echo "Scrub expired on ZFS pool ${pool}"
        if [ "$(date +%H)" == "$scrubrunhour" ]; then
          subject="ZFS pool $pool on server $myhostname scrub started"
          body="Scrub started on pool $pool because it was expired and scheduled for ${scrubrunhour}:00h. Please do not reboot until scrub is finished. The last scrub state was $scrubstate"
          send_mail
          zpool scrub $pool
        else
          echo "ZFS scrub will be scheduled for ${scrubrunhour}:00h"
        fi
      else
        if [ ! -z "$scrubdate" ]; then
          echo "ZFS scrub is not expired yet on pool $pool"
        else
          echo "ZFS scrub on pool $pool cant be checked because unknown OS or cant get date"
        fi
      fi
    else
      echo "Something went wrong with scrub on pool $pool, could not get scrub state"
    fi
  else
    echo "Cannot monitor scrub because scrubrunhour or scrubexpire parameter missing"
  fi
  #########################################
  # for disk space warnings
  if [ "$minfreegig" ]; then
    # get free gig space from zfs list because zpool doesn't show correct value on raidz
    capfreegigfullstring=$(zfs list -H -o avail $pool)
    capfreegig=$(echo $capfreegigfullstring | sed 's/.$//' | tr . ,) # remove unit suffix and replace dot with comma
    if [ "${capfreegigfullstring: -1}" = "T" ]; then
      capfreegig=$((capfreegig*1024)) # convert T to G, takes only comma as decimal separator
    fi
    capfreegig=$(echo $capfreegig | tr , . | xargs printf "%.*f\n" 0) # round value, beware: takes only dot as decimal separator
    capusedperc=$(echo $output | awk '{ print $3 }' | sed 's/.$//' | xargs printf "%.*f\n" 0)
    if [ "$capusedperc" -gt "$maxusedperc" ] || [ "$capfreegig" -lt "$minfreegig" ]; then # doesn't work with decimal numbers
      # if day from file is not today then we need to reset the counter
      if [ $daynow != $dayfromfile ]; then
        echo 0 > $spacewarncountfilenew
      fi
      read -r warncount <<< $(cat $spacewarncountfilenew)
      warncount=$((warncount+1))
      echo $warncount > $spacewarncountfilenew
      if [ "$maxwarncountperday" -ge "$warncount" ]; then
        echo -e "Disk space on ZFS pool $pool is in \e[31mWARNING\e[0m state, will send mail ..."
        subject="Disk space full on server $myhostname pool $pool"
        body="Used capacity $capusedperc percent is greater than $maxusedperc percent threshold or free capacity $capfreegig G is lower than $minfreegig G threshold"
        send_mail
        # on mail failure reset the counter so the warning is retried
        if [ $? -ne 0 ]; then echo 0 > $spacewarncountfilenew; fi
      else
        echo -e "Disk space on ZFS pool $pool is in \e[31mWARNING\e[0m state, cannot send mail because maxwarncountperday threshold reached"
      fi
    else
      echo -e "Free disk space \e[32mOK\e[0m on ZFS pool ${pool} - used cap ${capusedperc}% free space ${capfreegig}G"
    fi
  else
    echo "Cannot run disk space check, because minfreegig parameter is missing"
  fi
  ###########################################
  # the mail part for health
  if [ ! -f "$poolmailfilehealth" ]; then
    echo "0" > $poolmailfilehealth
  fi
  # get pool health state from last run, if not 0 then mail was sent already
  read -r poolmailhealth <<< $(cat $poolmailfilehealth)
  if [ "$health" != "ONLINE" ] && [ "$poolmailhealth" == "0" ]; then # if pool is not online and mail not sent yet
    echo "ZFS pool $pool is not healthy, will send mail ..."
    subject="ZFS pool $pool on server $myhostname is not healthy."
    # BUGFIX: $poolchksumerrors was never assigned; use $poolchksumstate
    body="Warning: ZFS pool $pool is in $health state. Checksum errors: ${poolchksumstate}. There will be no more automatic scrubs and no more error mails until this is fixed."
    send_mail
    if [ $? -eq 0 ]; then echo "1" > $poolmailfilehealth; fi
  elif [ "$health" != "ONLINE" ] && [ "$poolmailhealth" != "0" ]; then # if pool is not online and mail was sent already
    echo "ZFS pool $pool is not healthy, mail was already sent"
  elif [ "$health" == "ONLINE" ] && [ "$poolmailhealth" != "0" ]; then # pool recovered, send the all-clear once
    subject="ZFS pool $pool on server $myhostname health state is ok"
    body="Notice: ZFS pool $pool returned to $health state"
    send_mail
    if [ $? -eq 0 ]; then echo "0" > $poolmailfilehealth; fi # reset counter
  fi
done
#####################################################################################################
## checking S.M.A.R.T. health and attributes
# per-disk error files are derived from smarterrorfile (path/name-<disk>.ext)
smartpath="${smarterrorfile%/*}"
smartfilename=$(basename "$smarterrorfile")
smartfilename="${smartfilename%.*}"
smartext="${smarterrorfile##*.}"
# build the list of disk device names for the current platform
case "$platform" in
  Linux)
    # whole-disk sdX entries only (pattern skips partition lines like sda1)
    harddisks=$(lsblk -l | grep 'sd[a-z][^1-99]' | awk '{ print $1 }')
    ;;
  FreeBSD)
    # daN devices from the boot dmesg, colon stripped, deduplicated
    harddisks=$(egrep 'da[0-99]' /var/run/dmesg.boot | sed 's/://' | awk '{ print $1 }' | uniq)
    ;;
esac
##############################
# iterate through hard disks: check S.M.A.R.T. overall health and the critical
# attributes; every problem line is collected in a per-disk error file so a
# mail is only sent when the set of errors changes between runs
for disk in $harddisks; do
  smarterrordiskfile="${smartpath}/${smartfilename}-${disk}.${smartext}"
  # remember the errors of the previous run, then start with an empty file
  if [ -f $smarterrordiskfile ]; then
    read -r smarterrorsold <<< $(cat $smarterrordiskfile)
  fi
  cat /dev/null > $smarterrordiskfile
  # ${!disk} resolves a per-disk smartctl device option variable (e.g. sdc="sat")
  if [ ! -z "${!disk}" ]; then
    smartctl -A -H /dev/${disk} -d ${!disk} | awk '{ print $1,$2,$4,$6,$10 }' > $smartcache
  else
    smartctl -A -H /dev/${disk} | awk '{ print $1,$2,$4,$6,$10 }' > $smartcache
  fi
  # reduced smartctl columns: $1=ID $2=NAME $3=VALUE $4=THRESH $5=RAW_VALUE;
  # each cached line matches at most one grep below, so per iteration only
  # one of the id* variables is non-empty
  cat $smartcache | while read output
  do
    diskhealth=$(echo $output | grep "overall-health" | awk '{ print $4 }')
    id5=$(echo $output | grep "Reallocated_Sector" | awk '{ print $5}')
    id10=$(echo $output | grep "Spin_Retry" | awk '{ print $5}')
    id196=$(echo $output | grep "Reallocated_Event" | awk '{ print $5}')
    id197=$(echo $output | grep "Current_Pending" | awk '{ print $5}')
    id198=$(echo $output | grep "Offline_Uncorrectable" | awk '{ print $5}')
    id199=$(echo $output | grep "UDMA_CRC" | awk '{ print $5}')
    # BUGFIX: id202 was tested below but never parsed, leaving the lifetime
    # check dead code. NOTE(review): attribute 202 is usually named
    # Percent_Lifetime_Remain / Percent_Lifetime_Used — verify on your disks.
    id202=$(echo $output | grep "Percent_Lifetime" | awk '{ print $3}')
    id233=$(echo $output | grep "Media_Wearout" | awk '{ print $3}')
    if [ ! -z "$diskhealth" ] && [ $diskhealth != "PASSED" ]; then
      echo "-------------------------------------------"
      # BUGFIX: show the failure on screen too (was redirected only into the file)
      echo -e "S.M.A.R.T health state is \e[31mFAILED\e[0m on disk $disk" | tee -a $smarterrordiskfile
    elif [ ! -z "$diskhealth" ] && [ $diskhealth == "PASSED" ]; then
      echo "-------------------------------------------"
      echo -e "S.M.A.R.T. health state is \e[32mOK\e[0m on disk $disk"
    elif [ ! -z "$diskhealth" ]; then
      echo "-------------------------------------------"
      echo -e "S.M.A.R.T. health state is \e[31mUNKNOWN\e[0m on disk $disk"
    fi
    # raw values: anything above 0 counts as an error and is logged to the file
    if [ ! -z "$id5" ] && [ "$id5" -gt "0" ]; then
      echo "${disk}: $id5 reallocated sectors" | tee -a $smarterrordiskfile
    elif [ ! -z "$id5" ]; then
      echo "${disk}: 0 reallocated sectors"
    fi
    if [ ! -z "$id10" ] && [ "$id10" -gt "0" ]; then
      echo "${disk}: $id10 spin retry count" | tee -a $smarterrordiskfile
    elif [ ! -z "$id10" ]; then
      echo "${disk}: 0 spin retry count"
    fi
    if [ ! -z "$id196" ] && [ "$id196" -gt "0" ]; then
      echo "${disk}: $id196 reallocation events" | tee -a $smarterrordiskfile
    elif [ ! -z "$id196" ]; then
      echo "${disk}: 0 reallocation events"
    fi
    if [ ! -z "$id197" ] && [ "$id197" -gt "0" ]; then
      echo "${disk}: $id197 pending sectors" | tee -a $smarterrordiskfile
    elif [ ! -z "$id197" ]; then
      echo "${disk}: 0 pending sectors"
    fi
    if [ ! -z "$id198" ] && [ "$id198" -gt "0" ]; then
      echo "${disk}: $id198 offline uncorrectable" | tee -a $smarterrordiskfile
    elif [ ! -z "$id198" ]; then
      echo "${disk}: 0 offline uncorrectable"
    fi
    if [ ! -z "$id199" ] && [ "$id199" -gt "0" ]; then
      echo "${disk}: $id199 UDMA CRC error" | tee -a $smarterrordiskfile
    elif [ ! -z "$id199" ]; then
      echo "${disk}: 0 UDMA CRC error"
    fi
    # normalized lifetime values: 5 or below is considered a problem
    if [ ! -z "$id202" ] && [ "$id202" -le "5" ]; then
      echo "${disk}: $id202 percent lifetime problem" | tee -a $smarterrordiskfile
    elif [ ! -z "$id202" ]; then
      # BUGFIX: message previously printed ${id233} here
      echo "${disk}: percent lifetime is ${id202}"
    fi
    if [ ! -z "$id233" ] && [ "$id233" -le "5" ]; then
      echo "${disk}: $id233 media wearout problem" | tee -a $smarterrordiskfile
    elif [ ! -z "$id233" ]; then
      echo "${disk}: media wearout is ${id233}"
    fi
  done
  # compare the freshly collected error set against the previous run
  read -r smarterrors <<< $(cat $smarterrordiskfile) # get new values from file
  if [ "$smarterrors" ] && [ "$smarterrors" != "$smarterrorsold" ]; then
    echo -e "S.M.A.R.T. attributes \e[31mFAILURE\e[0m on disk $disk"
    subject="Disk $disk on server $myhostname has S.M.A.R.T. attribute errors"
    body="Disk $disk has attribute errors ${smarterrors}. There will be no more mail until status will change."
    send_mail
    # on mail failure drop the state file so the mail is retried next run
    if [ $? -ne 0 ]; then rm $smarterrordiskfile; fi
  elif [ ! "$smarterrors" ] && [ "$smarterrorsold" ]; then
    echo -e "S.M.A.R.T. attributes \e[32mRECOVERED\e[0m on disk $disk"
    subject="Disk $disk on server $myhostname S.M.A.R.T. recovered from errors"
    body="Disk $disk has no more attribute errors."
    send_mail
    if [ $? -ne 0 ]; then rm $smarterrordiskfile; fi
  elif [ ! "$smarterrors" ]; then
    echo -e "S.M.A.R.T. attributes \e[32mOK\e[0m on disk $disk"
  elif [ "$smarterrors" ]; then
    echo -e "S.M.A.R.T. attributes \e[31mFAILURE\e[0m on disk ${disk}, mail was already sent"
  fi
done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement