
#!/bin/ksh
# WRoelofs 17-4-2007: 0.5 Adapted for use with sudo 
# WRoelofs 26-4-2007: 0.6 Added exceptions on volume level (svvolume_exceptions.txt) 
#set -x
# check snapvault lag and collect statistics
#
# Collect stats for the following conditions
# snapvault succeeded: status idle  + lag <= ERROR_LAG
# snapvault failed: status idle + lag > ERROR_LAG
# snapvault not ready: status !idle + poll time > threshold backup window
# archive snapvault status output in file
#
# tested on NetApp Release 7.2: Mon Jul 31 16:36:02 PDT 2006
# Extend the command search path with /appl/tsu/bin.
PATH="${PATH}:/appl/tsu/bin"

# Lag limit in hours; polling time in crontab should be 09:00.
ERROR_LAG=24

# Who receives the lag report mail.
RECIPIENTS="wim.roelofs@atosorigin.com"

# Scratch files: raw status snapshot (per process id), mail body, and the
# list of lines that remain after the exception filters.
SNAPVAULT="/tmp/snapcheck.$$"
SNAPMAIL="/tmp/snapmail.txt"
FAILED="/tmp/snapfailed.txt"

# Output file for stats.
SNAPSTATS="/users/nldsm01/scripts/data/daily_pbs_summary.csw"
# Output file for snapvault status history.
ARCHIVE="/users/nldsm01/scripts/data/snapvault_history.log"

# Qtrees kept outside statistics.
QTREESKIP="/users/nldsm01/scripts/etc/svqtree_exceptions.txt"
# Volumes kept outside statistics.
VOLUMESKIP="/users/nldsm01/scripts/etc/svvolume_exceptions.txt"
# Error strings that indicate a cause not in control by ATOS.
ERRORS="/users/nldsm01/scripts/etc/snapv_error_strings"

# Space-separated list of filers to query.
FILERS="hwnaf01"
#
#
# Collect actual snapvault status
#
# Start from an empty snapshot file: its name only varies by process id, so
# a file left behind by an earlier aborted run would otherwise be appended to
# and its stale lines would be evaluated again.
: > "${SNAPVAULT}"

for filer in $FILERS
do
  # Only lines containing ':' are kept (the later parsing expects a
  # filer:path destination and a colon-separated lag on each line).
  # The pipeline status is grep's, so non-zero means not a single such line
  # came back from this filer; say so on stderr instead of silently
  # recording empty statistics.
  sudo rsh -n "${filer}" snapvault status | grep ':' >> "${SNAPVAULT}" ||
    echo "warning: no snapvault status lines received from ${filer}" >&2
done

# Remove the mail body and failed list of a previous run.
rm -f "${SNAPMAIL}" "${FAILED}" > /dev/null 2>&1

# _lag_hours LAG
# Print the whole-hour part (text before the first ':') of a lag value as a
# plain decimal number with leading zeros removed, so the numeric test in the
# loop below always gets a clean integer.
# Returns 1 and prints nothing when that part is empty or not all digits
# (for example a lag shown as "-").
_lag_hours() {
  _lh=${1%%:*}
  case "$_lh" in
    '' | *[!0-9]*) return 1 ;;
  esac
  # Drop leading zeros but always keep one digit, so "00" becomes "0".
  while [ "${#_lh}" -gt 1 ]; do
    case "$_lh" in
      0*) _lh=${_lh#0} ;;
      *) break ;;
    esac
  done
  printf '%s\n' "$_lh"
}

# Walk the collected status lines and record every relationship whose lag
# exceeds ERROR_LAG hours in ${SNAPMAIL}, together with the filer's
# "Current Transfer Error:" line for that destination.
# Skipped entirely when the snapshot file is not readable.
if [ -r "${SNAPVAULT}" ]; then
  while read -r line
  do

    # get lag in hours (4th whitespace-separated column)
    lag=$(printf '%s\n' "$line" | awk '{print $4}')

    # Lines without a numeric lag cannot be compared; they were never
    # reported before either (the numeric test simply errored out).
    hour=$(_lag_hours "$lag") || continue

    # NOTE(review): only whole hours are compared, so with ERROR_LAG=24 a lag
    # of 24:59:59 is still accepted.
    if [ "$hour" -gt "$ERROR_LAG" ]; then
      dest=$(printf '%s\n' "$line" | awk '{print $2}')
      filer=$(printf '%s\n' "$dest" | awk -F: '{print $1}')
      # rsh -n keeps the remote command from consuming the loop's input.
      error=$(sudo rsh -n "${filer}" snapvault status -l "$dest" | grep '^Current Transfer Error:')
      # Columns 2-5 of the status line followed by the error line.
      printf '%s\n' "$line" | awk '{print $2,$3,$4,$5,n}' n="$error" >> "${SNAPMAIL}"
    fi

  done < "${SNAPVAULT}"
fi

# Append a dated copy of the collected status lines to the history log.
{
  echo "PBS archive: $(date)"
  cat "${SNAPVAULT}"
} >> "${ARCHIVE}"

# Mail the recorded lines (lag > ${ERROR_LAG}) when a mail body was written.
if [ -f "${SNAPMAIL}" ]; then
  mailx -s "Snapvault treshold exceeded" ${RECIPIENTS} < "${SNAPMAIL}"
fi

# Number of collected status lines that mention ec_vault.
TOTAL_SNAPVAULTS=$(grep ec_vault "${SNAPVAULT}" | wc -l | tr -d ' ')

# Make sure the failed list exists even when nothing is written to it.
touch "${FAILED}"

# _sv_listed TEXT FILE
# Succeeds only when TEXT is non-empty, FILE is readable and some line of
# FILE contains TEXT literally (no regular-expression interpretation).
# Produces no output.
_sv_listed() {
  [ -n "$1" ] && [ -r "$2" ] && grep -F -- "$1" "$2" > /dev/null 2>&1
}

# _sv_is_failure LINE
# LINE is one record of the lag report: destination path first, and the text
# after the 4th ':' is expected to be the error string.
# Returns 0 when the record has to be counted, 1 when it is covered by a
# volume exception, a qtree exception, or a listed error string.
# Reads VOLUMESKIP, QTREESKIP and ERRORS. An unreadable list or an empty
# error string never counts as an exception.
_sv_is_failure() {
  _sv_qtree=$(printf '%s\n' "$1" | awk '{print $1}')
  _sv_filer=$(printf '%s\n' "$_sv_qtree" | awk -F: '{print $1}')
  _sv_volume=$(printf '%s\n' "$_sv_qtree" | awk -F/ '{print $3}')
  _sv_volume="${_sv_filer}:/vol/${_sv_volume}"
  # field 5 should be error string; squeeze repeated spaces and trim the ends
  _sv_error=$(printf '%s\n' "$1" | awk -F: '{print $5}' | tr -s ' ' |
    sed -e 's/^ //' -e 's/ $//')

  if _sv_listed "$_sv_volume" "${VOLUMESKIP}"; then return 1; fi # exception volume
  if _sv_listed "$_sv_qtree" "${QTREESKIP}"; then return 1; fi   # exception qtree
  if _sv_listed "$_sv_error" "${ERRORS}"; then return 1; fi      # cause listed in ERRORS
  return 0
}

# filter exceptions
# The lag report only exists when at least one relationship exceeded the
# limit, so there is nothing to filter when it is absent.
if [ -f "${SNAPMAIL}" ]; then
  while read -r line
  do
    if _sv_is_failure "$line"; then
      printf '%s\n' "$line" >> "${FAILED}"
    fi
  done < "${SNAPMAIL}"
fi

# Lines in the failed list that contain "Idle": failed snapvaults without exceptions.
FAILED_SNAPVAULTS=$(grep Idle "${FAILED}" | wc -l | tr -d ' ')

# Lines in the failed list without "Idle": snapvaults not ready.
RUNNING_SNAPVAULTS=$(grep -v Idle "${FAILED}" | wc -l | tr -d ' ')

# Append one record to the stats file:
# date,total snapvaulted qtrees,total failed snapvaults,total snapvaults not ready
stats_date=$(date +%m/%d/%y)
echo "${stats_date},${TOTAL_SNAPVAULTS},${FAILED_SNAPVAULTS},${RUNNING_SNAPVAULTS}" >> "${SNAPSTATS}"

# The per-run status snapshot is no longer needed.
rm -f "${SNAPVAULT}" > /dev/null 2>&1

