Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# 2015-2016 by ghost75 v1.61
# Tested on Nas4free and Linux.
#
# Monitors ZFS pool health, free space, scrub age and S.M.A.R.T. attributes
# and mails warnings when something changes (run with -h for details).
maxusedperc="80"        # warn when pool used capacity exceeds this percent
minfreegig="20"         # only Gigabytes and digits allowed
maxwarncountperday="1"  # how many disk space warnings per day? 0 to disable space warning mail
scrubexpire="1728000"   # after how many seconds scrub will expire: 20d x 24h x 3600s = 1728000
scrubrunhour="22"       # run scrub off peak time, 24h time format
# state files that persist what was already reported between cron runs
spacewarncountfile="/tmp/spacewarn_count.log" # to get track when to send mail
dayfile="/tmp/spacewarn_day.log"              # to store actual day
poolmailfile="/tmp/pool.log"                  # log related to zfs pool
smarterrorfile="/tmp/smarterrors.log"         # log includes smart attribute and unhealthy state if disk has error
smartcache="/tmp/smartcache.log"              # stores output of smartctl
# smartctl device options: one variable per disk name, resolved via ${!disk}
sdc="sat"
# email address needs only to be set if you dont want to use the values from
# Nas4Free WebUI (System|Advanced|Email and Disk|Management|Smart or Status|Email Report)
# or if you have a non-NAS4Free system
email_to="root" # required on non Nas4free
#email_from=""
############################################################
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# pull in the user environment when present (quoted; previously an unquoted
# unconditional source that printed an error if the file was missing)
if [ -f "$HOME/.profile" ]; then
  source "$HOME/.profile"
fi
#######################################
# Send a notification mail.
# Globals (read): platform, config, email_from, email_to, subject, body
# On Nas4free (FreeBSD with a parsed config.xml) mail is handed to msmtp with
# the WebUI settings; on other supported platforms the mail(1) command is used.
# Returns non-zero when the mail could not be handed off, so callers can
# retry on the next run.
#######################################
send_mail() {
  if [ "$platform" == "FreeBSD" ] && [ ! -z "$config" ]; then
    # '%b' expands the \n escapes but keeps the message as DATA, so a '%'
    # inside subject/body no longer corrupts the mail (was used as format)
    printf '%b' "From:$email_from\nTo:$email_to\nSubject:$subject\n\n$body" | /usr/local/bin/msmtp --file=/var/etc/msmtp.conf -t
  elif [ "$platform" == "Unix" ] || [ "$platform" == "Linux" ] || [ "$platform" == "FreeBSD" ]; then
    if [ ! -z "$email_to" ]; then
      mail -s "$subject" "$email_to" <<< "$body"
    else
      echo "Cannot send mail because parameter email_to is missing, will try to send again next time" >&2
    fi
  fi
}
#######################################
# Print usage information and exit 0.
#######################################
run_help() {
  cat <<'EOF'

 Monitor your ZFS pools

 Script will monitor disk space and send email if thresholds are reached,
 it will send mail only a specific amount of times per day, which can be set
 by parameter.
 Also there will be an email for any zfs pools which are not in online state
 (i.e. degraded) or those with chksum errors. There will be an email once for
 every new error after the old error is fixed.
 Scrub will be also taken place on a regular basis (see scrubexpire parameter)
 if there are no chksum errors, scrub errors from last run and if pool is ONLINE
 and not in resilver state.
 S.M.A.R.T. attributes id5,10,196,197,198 will be monitored for any raw value > 0.
 There will be email for every new disk error but no reoccuring mail if the
 error stays the same. If the script cannot send mail, it will try next time.

 The following parameters are supported:
 -e scrubexpire, how many seconds there will be no new scrub
 -f email_from
 -g minfreegig, for disk space monitoring
 -p maxusedperc, for disk space monitoring
 -r scrubrunhour, in 24h format
 -t email_to 

 It can be scheduled with cron to run like every 5min i.e.:
 */5 * * * * root /usr/local/sbin/zfshealth
 You may need also to set PATH variable in cron file.

EOF
  exit 0
}
# Command line options, if given, override the defaults configured above.
while getopts 'e:f:g:hp:r:t:' opt; do
  case "$opt" in
    e) scrubexpire=$OPTARG ;;
    f) email_from=$OPTARG ;;
    g) minfreegig=$OPTARG ;;
    h) run_help ;;
    p) maxusedperc=$OPTARG ;;
    r) scrubrunhour=$OPTARG ;;
    t) email_to=$OPTARG ;;
  esac
done
## which platform are we?
# Sets platform and myhostname; aborts when the OS is unsupported.
case "$(uname)" in
  Linux)
    platform="Linux"
    myhostname=$(hostname)
    ;;
  FreeBSD)
    platform="FreeBSD"
    myhostname=$HOST
    ;;
  Unix)
    platform="Unix"
    myhostname=$HOST
    ;;
  *)
    platform="unknown"
    myhostname="unknown"
    echo "OS is not supported, exit here"
    exit 1
    ;;
esac
###########################
# Checking days (for limiting space warning mails)
daynow=$(date +"%d")
# seed the day file on the very first run
[ -f "$dayfile" ] || echo $daynow > $dayfile
# remember which day the previous run saw, then record today's day
read -r dayfromfile <<< $(cat $dayfile)
echo $daynow > $dayfile
#######################
# current time in epoch seconds, used later for the scrub expiry check
currentscrubdate=$(date +"%s")
#######################
# Find out if it's a nas4free embedded or full install in order to get the email address
if [ -f /etc/platform ] && [ "$platform" == "FreeBSD" ]; then
  if grep -q embedded /etc/platform; then
    config="/cf/conf/config.xml" # embedded installs keep the config on /cf
    # remount /cf when the config is not visible there
    [ -f "$config" ] || { umount /cf && mount /cf; }
  else
    config="/conf/config.xml" # full installs
  fi
  # Email parameters: fall back to the WebUI values when not set already
  [ -z "$email_to" ] && [ -f $config ] && email_to=$(/usr/local/bin/xml sel -t -v "//smartd/email/to" $config)
  [ -z "$email_from" ] && [ -f $config ] && email_from=$(/usr/local/bin/xml sel -t -v "//email/from" $config)
fi
##############################################################################################################
## Check ZFS pools for health, scrub and disk space
# Get disk space and pool state; 4 columns: name, free, cap, health.
# NOTE: the loop body runs in a pipeline subshell, so everything that must
# survive to the next run is persisted in files under /tmp, not in variables.
zpool list -H -o name,free,cap,health | while read output
do
  ################################
  # get basic stuff like pool name and health
  pool=$(echo $output | awk '{ print $1 }')
  health=$(echo $output | awk '{ print $4 }')
  if [ "$health" == "ONLINE" ]; then
    echo -e "ZFS pool $pool state is \e[32m${health}\e[0m"
  else
    echo -e "ZFS pool $pool state is \e[31m${health}\e[0m"
  fi
  ##################################
  # per-pool state files for pool errors, health and scrub state, derived
  # from poolmailfile by inserting the pool name before the extension
  poolpath="${poolmailfile%/*}"
  poolfilename=$(basename "$poolmailfile")
  poolfilename="${poolfilename%.*}"
  poolext="${poolmailfile##*.}"
  poolmailfilehealth="${poolpath}/${poolfilename}-${pool}-health.${poolext}"
  poolmailfilescrub="${poolpath}/${poolfilename}-${pool}-scrub.${poolext}"
  poolfilescrubstate="${poolpath}/${poolfilename}-${pool}-scrubstate.${poolext}"
  poolfilechksumstate="${poolpath}/${poolfilename}-${pool}-chksumstate.${poolext}"
  # per-pool file for disk space warnings
  spacewarnpath="${spacewarncountfile%/*}"
  spacewarnfilename=$(basename "$spacewarncountfile")
  spacewarnfilename="${spacewarnfilename%.*}"
  spacewarnext="${spacewarncountfile##*.}"
  spacewarncountfilenew="${spacewarnpath}/${spacewarnfilename}-${pool}.${spacewarnext}"
  # create if it doesn't exist
  if [ ! -f "$spacewarncountfilenew" ]; then
    echo 0 > $spacewarncountfilenew
  fi
  ###################################
  # see if there are chksum errors: keep the read/write/cksum counters of
  # ONLINE devices, dropping "000" lines (all three counters zero)
  if [ ! -f "$poolfilechksumstate" ]; then
    touch $poolfilechksumstate
  fi
  read -r poolchksumstateold <<< $(cat $poolfilechksumstate)
  zpool status ${pool} | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000 > $poolfilechksumstate
  read -r poolchksumstate <<< $(cat $poolfilechksumstate)
  if [ "${poolchksumstate}" ]; then
    # BUGFIX: was a plain echo (no -e) of the state file PATH; show the counters
    echo -e "Warning: \e[31m${poolchksumstate}\e[0m ZFS Chksum errors found!"
    if [ "${poolchksumstate}" != "${poolchksumstateold}" ]; then
      subject="ZFS pool $pool server $myhostname checksum errors found"
      # BUGFIX: $poolchksumerrors was never assigned anywhere; use $poolchksumstate
      body="Warning: ZFS pool $pool has $poolchksumstate checksum errors. Please check your disks. There will be no more automatic scrub on this pool, until this is fixed."
      send_mail
      # if mail could not be sent, forget the state so we retry next run
      if [ $? -ne 0 ]; then rm $poolfilechksumstate; fi
    else
      echo "Mail about chksum errors was already sent"
    fi
  else
    echo -e "\e[32m0\e[0m ZFS chksum errors found on pool $pool"
  fi
  #######################################
  # monitor scrub and issue if needed (doesn't run when pool is not online,
  # scrub or resilver is in place, chksum errors were found or it was never run before)
  if [ "$scrubrunhour" ] || [ "$scrubexpire" ]; then
    if [ ! -f "$poolmailfilescrub" ]; then
      echo "0" > $poolmailfilescrub
    fi
    # get pool scrub mail state from last run, if not 0 then mail was sent already
    read -r poolmailscrub <<< $(cat $poolmailfilescrub)
    if [ ! -f "$poolfilescrubstate" ]; then
      touch $poolfilescrubstate
    fi
    # get scrub state from last run
    read -r scrubstateold <<< $(cat $poolfilescrubstate)
    # get actual scrub state
    zpool status $pool | egrep "none requested|resilver|scrub in progress|scrub repaired" > $poolfilescrubstate
    read -r scrubstate <<< $(cat $poolfilescrubstate)
    if [ $(echo $scrubstate | egrep -c "none requested|resilvered") -ge 1 ]; then
      echo "ZFS scrub was not run before on ${pool} or it was resilvered, cannot monitor or issue new scrubs automatically"
      if [ "$health" == "ONLINE" ] && [ "$poolmailscrub" != "1" ]; then # if drive is online but scrub cannot be scheduled
        echo "Cannot schedule scrub"
        subject="ZFS pool $pool server $myhostname cannot schedule scrub"
        body="Warning: ZFS pool $pool is ONLINE but cannot be scrubed automatically, please run scrub manually one time."
        send_mail
        if [ $? -eq 0 ]; then echo "1" > $poolmailfilescrub; fi
      elif [ "$health" == "ONLINE" ] && [ "$poolmailscrub" == "1" ]; then
        echo "Cannot schedule scrub, mail was already sent"
      fi
    elif [ "$health" == "ONLINE" ] && [ "$poolmailscrub" != "0" ]; then
      echo "ZFS scrub check is working now"
      subject="ZFS pool $pool on server $myhostname scrub check is ok now"
      body="Notice: ZFS pool $pool will be scrubed automatically from now on"
      send_mail
      echo "0" > $poolmailfilescrub # reset counter
    elif [ $(echo $scrubstate | egrep -c "scrub repaired 0") -ge 1 ] && [ "$scrubstate" != "$scrubstateold" ] && [ "$scrubstateold" ]; then
      echo "ZFS scrub finished on ${pool}"
      subject="ZFS pool $pool on server $myhostname scrub finished"
      # BUGFIX: body said "pool $myhostname"; report the pool name
      body="Scrub finished on pool $pool with result $scrubstate"
      send_mail
    elif [ $(echo $scrubstate | egrep -c "scrub repaired") -ge 1 ] && [ "$scrubstate" != "$scrubstateold" ] && [ "$scrubstateold" ]; then
      echo "ZFS scrub finished on ${pool} with errors"
      subject="ZFS pool $pool on server $myhostname scrub finished with errors"
      body="Scrub finished on pool $pool with errors. There will be no subsequent scrub until error is cleared out. Result: $scrubstate"
      send_mail
    elif [ $(echo $scrubstate | egrep -c "scrub in progress|resilver") -ge 1 ]; then
      echo "ZFS scrub or resilver is already in progress on ${pool}, cannot schedule scrub"
    elif [ "$health" != "ONLINE" ]; then
      echo "Cannot schedule scrub, because ZFS pool $pool is not online."
    elif [ "${poolchksumstate}" ]; then
      # BUGFIX: this guard tested $poolchksumerrors (never set) and could never fire
      echo "Cannot scrub, because ZFS pool $pool has chksum errors."
    elif [ $(echo $scrubstate | egrep -c "scrub repaired 0") -ge 1 ] && [ "$health" == "ONLINE" ]; then
      # everything ok, scrub can be scheduled; parse the date of the last scrub
      if [ "$platform" == "FreeBSD" ] || [ "$platform" == "Unix" ]; then
        #scrubrawdate=$(zpool status $pool | grep scrub | awk '{print $15 $12 $13}')
        scrubrawdate=$(echo $scrubstate | grep "scrub repaired" | awk '{print $15 $12 $13}')
        scrubdate=$(date -j -f '%Y%b%e-%H%M%S' $scrubrawdate'-000000' +%s)
      elif [ $platform == "Linux" ]; then
        #scrubrawdate=$(zpool status $pool | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
        scrubrawdate=$(echo $scrubstate | grep "scrub repaired" | awk '{print $11" "$12" " $13" " $14" "$15}')
        scrubdate=$(date -d "$scrubrawdate" +%s)
      fi
      if [ $(($currentscrubdate - $scrubdate)) -ge $scrubexpire ] && [ ! -z "$scrubdate" ]; then
        echo "Scrub expired on ZFS pool ${pool}"
        if [ "$(date +%H)" == "$scrubrunhour" ]; then
          subject="ZFS pool $pool on server $myhostname scrub started"
          body="Scrub started on pool $pool because it was expired and scheduled for ${scrubrunhour}:00h. Please do not reboot until scrub is finished. The last scrub state was $scrubstate"
          send_mail
          zpool scrub $pool
        else
          echo "ZFS scrub will be scheduled for ${scrubrunhour}:00h"
        fi
      else
        if [ ! -z "$scrubdate" ]; then
          echo "ZFS scrub is not expired yet on pool $pool"
        else
          echo "ZFS scrub on pool $pool cant be checked because unknown OS or cant get date"
        fi
      fi
    else
      echo "Something went wrong with scrub on pool $pool, could not get scrub state"
    fi
  else
    echo "Cannot monitor scrub because scrubrunhour or scrubexpire parameter missing"
  fi
  #########################################
  # for disk space warnings
  if [ "$minfreegig" ]; then
    # get free gig space from zfs list because zpool doesn't show correct value on raidz
    capfreegigfullstring=$(zfs list -H -o avail $pool)
    capfreegig=$(echo $capfreegigfullstring | sed 's/.$//' | tr . ,) # remove unit suffix and replace dot with comma
    if [ "${capfreegigfullstring: -1}" = "T" ]; then
      capfreegig=$((capfreegig*1024)) # convert T to G, takes only comma as decimal separator
    fi
    capfreegig=$(echo $capfreegig | tr , . | xargs printf "%.*f\n" 0) # round value, beware: takes only dot as decimal separator
    capusedperc=$(echo $output | awk '{ print $3 }' | sed 's/.$//' | xargs printf "%.*f\n" 0)
    if [ "$capusedperc" -gt "$maxusedperc" ] || [ "$capfreegig" -lt "$minfreegig" ]; then # doesn't work with decimal numbers
      # if day from file is not today then we need to reset the counter
      if [ $daynow != $dayfromfile ]; then
        echo 0 > $spacewarncountfilenew
      fi
      read -r warncount <<< $(cat $spacewarncountfilenew)
      warncount=$((warncount+1))
      echo $warncount > $spacewarncountfilenew
      if [ "$maxwarncountperday" -ge "$warncount" ]; then
        echo -e "Disk space on ZFS pool $pool is in \e[31mWARNING\e[0m state, will send mail ..."
        subject="Disk space full on server $myhostname pool $pool"
        body="Used capacity $capusedperc percent is greater than $maxusedperc percent threshold or free capacity $capfreegig G is lower than $minfreegig G threshold"
        send_mail
        # on mail failure reset the counter so the warning is retried
        if [ $? -ne 0 ]; then echo 0 > $spacewarncountfilenew; fi
      else
        echo -e "Disk space on ZFS pool $pool is in \e[31mWARNING\e[0m state, cannot send mail because maxwarncountperday threshold reached"
      fi
    else
      echo -e "Free disk space \e[32mOK\e[0m on ZFS pool ${pool} - used cap ${capusedperc}% free space ${capfreegig}G"
    fi
  else
    echo "Cannot run disk space check, because minfreegig parameter is missing"
  fi
  ###########################################
  # the mail part for health
  if [ ! -f "$poolmailfilehealth" ]; then
    echo "0" > $poolmailfilehealth
  fi
  # get pool health state from last run, if not 0 then mail was sent already
  read -r poolmailhealth <<< $(cat $poolmailfilehealth)
  if [ "$health" != "ONLINE" ] && [ "$poolmailhealth" == "0" ]; then # if pool is not online and mail not sent yet
    echo "ZFS pool $pool is not healthy, will send mail ..."
    subject="ZFS pool $pool on server $myhostname is not healthy."
    # BUGFIX: $poolchksumerrors was never assigned; use $poolchksumstate
    body="Warning: ZFS pool $pool is in $health state. Checksum errors: ${poolchksumstate}. There will be no more automatic scrubs and no more error mails until this is fixed."
    send_mail
    if [ $? -eq 0 ]; then echo "1" > $poolmailfilehealth; fi
  elif [ "$health" != "ONLINE" ] && [ "$poolmailhealth" != "0" ]; then # if pool is not online and mail was sent already
    echo "ZFS pool $pool is not healthy, mail was already sent"
  elif [ "$health" == "ONLINE" ] && [ "$poolmailhealth" != "0" ]; then # pool recovered, send the all-clear once
    subject="ZFS pool $pool on server $myhostname health state is ok"
    body="Notice: ZFS pool $pool returned to $health state"
    send_mail
    if [ $? -eq 0 ]; then echo "0" > $poolmailfilehealth; fi # reset counter
  fi
done
#####################################################################################################
## checking S.M.A.R.T. health and attributes
# per-disk error files are derived from smarterrorfile (path/name-<disk>.ext)
smartpath="${smarterrorfile%/*}"
smartfilename=$(basename "$smarterrorfile")
smartfilename="${smartfilename%.*}"
smartext="${smarterrorfile##*.}"
# build the list of disk device names for the current platform
case "$platform" in
  Linux)
    # whole-disk sdX entries only (pattern skips partition lines like sda1)
    harddisks=$(lsblk -l | grep 'sd[a-z][^1-99]' | awk '{ print $1 }')
    ;;
  FreeBSD)
    # daN devices from the boot dmesg, colon stripped, deduplicated
    harddisks=$(egrep 'da[0-99]' /var/run/dmesg.boot | sed 's/://' | awk '{ print $1 }' | uniq)
    ;;
esac
##############################
# iterate through hard disks: check S.M.A.R.T. overall health and the critical
# attributes; every problem line is collected in a per-disk error file so a
# mail is only sent when the set of errors changes between runs
for disk in $harddisks; do
  smarterrordiskfile="${smartpath}/${smartfilename}-${disk}.${smartext}"
  # remember the errors of the previous run, then start with an empty file
  if [ -f $smarterrordiskfile ]; then
    read -r smarterrorsold <<< $(cat $smarterrordiskfile)
  fi
  cat /dev/null > $smarterrordiskfile
  # ${!disk} resolves a per-disk smartctl device option variable (e.g. sdc="sat")
  if [ ! -z "${!disk}" ]; then
    smartctl -A -H /dev/${disk} -d ${!disk} | awk '{ print $1,$2,$4,$6,$10 }' > $smartcache
  else
    smartctl -A -H /dev/${disk} | awk '{ print $1,$2,$4,$6,$10 }' > $smartcache
  fi
  # reduced smartctl columns: $1=ID $2=NAME $3=VALUE $4=THRESH $5=RAW_VALUE;
  # each cached line matches at most one grep below, so per iteration only
  # one of the id* variables is non-empty
  cat $smartcache | while read output
  do
    diskhealth=$(echo $output | grep "overall-health" | awk '{ print $4 }')
    id5=$(echo $output | grep "Reallocated_Sector" | awk '{ print $5}')
    id10=$(echo $output | grep "Spin_Retry" | awk '{ print $5}')
    id196=$(echo $output | grep "Reallocated_Event" | awk '{ print $5}')
    id197=$(echo $output | grep "Current_Pending" | awk '{ print $5}')
    id198=$(echo $output | grep "Offline_Uncorrectable" | awk '{ print $5}')
    id199=$(echo $output | grep "UDMA_CRC" | awk '{ print $5}')
    # BUGFIX: id202 was tested below but never parsed, leaving the lifetime
    # check dead code. NOTE(review): attribute 202 is usually named
    # Percent_Lifetime_Remain / Percent_Lifetime_Used — verify on your disks.
    id202=$(echo $output | grep "Percent_Lifetime" | awk '{ print $3}')
    id233=$(echo $output | grep "Media_Wearout" | awk '{ print $3}')
    if [ ! -z "$diskhealth" ] && [ $diskhealth != "PASSED" ]; then
      echo "-------------------------------------------"
      # BUGFIX: show the failure on screen too (was redirected only into the file)
      echo -e "S.M.A.R.T health state is \e[31mFAILED\e[0m on disk $disk" | tee -a $smarterrordiskfile
    elif [ ! -z "$diskhealth" ] && [ $diskhealth == "PASSED" ]; then
      echo "-------------------------------------------"
      echo -e "S.M.A.R.T. health state is \e[32mOK\e[0m on disk $disk"
    elif [ ! -z "$diskhealth" ]; then
      echo "-------------------------------------------"
      echo -e "S.M.A.R.T. health state is \e[31mUNKNOWN\e[0m on disk $disk"
    fi
    # raw values: anything above 0 counts as an error and is logged to the file
    if [ ! -z "$id5" ] && [ "$id5" -gt "0" ]; then
      echo "${disk}: $id5 reallocated sectors" | tee -a $smarterrordiskfile
    elif [ ! -z "$id5" ]; then
      echo "${disk}: 0 reallocated sectors"
    fi
    if [ ! -z "$id10" ] && [ "$id10" -gt "0" ]; then
      echo "${disk}: $id10 spin retry count" | tee -a $smarterrordiskfile
    elif [ ! -z "$id10" ]; then
      echo "${disk}: 0 spin retry count"
    fi
    if [ ! -z "$id196" ] && [ "$id196" -gt "0" ]; then
      echo "${disk}: $id196 reallocation events" | tee -a $smarterrordiskfile
    elif [ ! -z "$id196" ]; then
      echo "${disk}: 0 reallocation events"
    fi
    if [ ! -z "$id197" ] && [ "$id197" -gt "0" ]; then
      echo "${disk}: $id197 pending sectors" | tee -a $smarterrordiskfile
    elif [ ! -z "$id197" ]; then
      echo "${disk}: 0 pending sectors"
    fi
    if [ ! -z "$id198" ] && [ "$id198" -gt "0" ]; then
      echo "${disk}: $id198 offline uncorrectable" | tee -a $smarterrordiskfile
    elif [ ! -z "$id198" ]; then
      echo "${disk}: 0 offline uncorrectable"
    fi
    if [ ! -z "$id199" ] && [ "$id199" -gt "0" ]; then
      echo "${disk}: $id199 UDMA CRC error" | tee -a $smarterrordiskfile
    elif [ ! -z "$id199" ]; then
      echo "${disk}: 0 UDMA CRC error"
    fi
    # normalized lifetime values: 5 or below is considered a problem
    if [ ! -z "$id202" ] && [ "$id202" -le "5" ]; then
      echo "${disk}: $id202 percent lifetime problem" | tee -a $smarterrordiskfile
    elif [ ! -z "$id202" ]; then
      # BUGFIX: message previously printed ${id233} here
      echo "${disk}: percent lifetime is ${id202}"
    fi
    if [ ! -z "$id233" ] && [ "$id233" -le "5" ]; then
      echo "${disk}: $id233 media wearout problem" | tee -a $smarterrordiskfile
    elif [ ! -z "$id233" ]; then
      echo "${disk}: media wearout is ${id233}"
    fi
  done
  # compare the freshly collected error set against the previous run
  read -r smarterrors <<< $(cat $smarterrordiskfile) # get new values from file
  if [ "$smarterrors" ] && [ "$smarterrors" != "$smarterrorsold" ]; then
    echo -e "S.M.A.R.T. attributes \e[31mFAILURE\e[0m on disk $disk"
    subject="Disk $disk on server $myhostname has S.M.A.R.T. attribute errors"
    body="Disk $disk has attribute errors ${smarterrors}. There will be no more mail until status will change."
    send_mail
    # on mail failure drop the state file so the mail is retried next run
    if [ $? -ne 0 ]; then rm $smarterrordiskfile; fi
  elif [ ! "$smarterrors" ] && [ "$smarterrorsold" ]; then
    echo -e "S.M.A.R.T. attributes \e[32mRECOVERED\e[0m on disk $disk"
    subject="Disk $disk on server $myhostname S.M.A.R.T. recovered from errors"
    body="Disk $disk has no more attribute errors."
    send_mail
    if [ $? -ne 0 ]; then rm $smarterrordiskfile; fi
  elif [ ! "$smarterrors" ]; then
    echo -e "S.M.A.R.T. attributes \e[32mOK\e[0m on disk $disk"
  elif [ "$smarterrors" ]; then
    echo -e "S.M.A.R.T. attributes \e[31mFAILURE\e[0m on disk ${disk}, mail was already sent"
  fi
done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement