
#!/bin/ksh
# WRoelofs 17-4-2007: 0.5 Adapted for use with sudo 
# WRoelofs 26-4-2007: 0.6 Added exceptions on volume level (svvolume_exceptions.txt) 
# WRoelofs 21-5-2007: 1.0 Added calculations for %success, %ready written to SNAPSTATS 
# AHilgersom 6-12-2007: 1.1 Adapted for use with ssh
#set -x
# check snapvault lag and collect statistics
#
# Collect stats for the following conditions
# snapvault succeeded: status idle  + lag <= ERROR_LAG
# snapvault failed: status idle + lag > ERROR_LAG
# snapvault not ready: status !idle + poll time > threshold backup window
# archive snapvault status output in file
#
# tested on NetApp Release 7.2: Mon Jul 31 16:36:02 PDT 2006
# Extend the command search path with the site tools directory
PATH="${PATH}:/appl/tsu/bin"

# Lag limit in hours; a lag above this value is reported
# (polling time in crontab should be 09:00)
ERROR_LAG=24

# Addressee of the control mail and the filers polled over ssh
RECIPIENTS="gmnl-msscentral@atosorigin.com"
FILERS="hwnaf01"

# Work files in /tmp
SNAPVAULT="/tmp/snapcheck.$$"
SNAPMAIL="/tmp/snapmail.txt"
FAILED="/tmp/snapfailed.txt"

# Files below the scripts directory
SCRIPTS_DIR="/users/nldsm01/scripts"
# output file for stats
SNAPSTATS="${SCRIPTS_DIR}/data/daily_pbs_summary.csw"
# output file for snapvault status history
ARCHIVE="${SCRIPTS_DIR}/data/snapvault_history.log"
# qtrees kept outside statistics
QTREESKIP="${SCRIPTS_DIR}/etc/svqtree_exceptions.txt"
# volumes kept outside statistics
VOLUMESKIP="${SCRIPTS_DIR}/etc/svvolume_exceptions.txt"
# error strings that indicate a cause not in control by ATOS
ERRORS="${SCRIPTS_DIR}/etc/snapv_error_strings"

# KPI counters, declared as integers and starting at zero
typeset -i TOTAL_SNAPVAULTS=0 FAILED_SNAPVAULTS=0 RUNNING_SNAPVAULTS=0
#
# Collect actual snapvault status
#
# Every filer in FILERS is asked for its "snapvault status" over ssh and the
# lines containing 'Snapvaulted' are gathered in ${SNAPVAULT}.

# Create the collection file up front: it is only ever appended to below, so
# without this a run that collects nothing leaves the later readers of
# ${SNAPVAULT} with a missing file.
: > "${SNAPVAULT}"

for filer in $FILERS
do
  # The pipeline status is grep's: non-zero means no Snapvaulted line came
  # back (ssh failure included). Say so on stderr instead of silently
  # producing statistics over zero snapvaults.
  ssh -n "${filer}" snapvault status | grep 'Snapvaulted' >> "${SNAPVAULT}" ||
    echo "$0: no Snapvaulted lines received from ${filer}" >&2
done

# Reset the work files of the previous run.
# ${SNAPMAIL} is re-created empty because it is read back unconditionally
# further down, while it is only written when a lagging snapvault is found.
rm -f "${SNAPMAIL}" > /dev/null 2>&1
: > "${SNAPMAIL}"
rm -f "${FAILED}" > /dev/null 2>&1

# lag_hours LAG
#   Prints the hour part of a lag given as hh:mm:ss (everything before the
#   first ':') as a plain decimal number without zero padding, so it can be
#   used safely in a numeric test ("00" -> 0, "09" -> 9, "123" -> 123).
#   Prints nothing and returns 1 when the hour part is empty or not numeric.
lag_hours() {
  lh_hour=${1%%:*}
  case $lh_hour in
    ''|*[!0-9]*) return 1 ;;
  esac
  # drop leading zeros one by one, but keep a lone "0"
  while [ "${#lh_hour}" -gt 1 ] && [ "${lh_hour#0}" != "$lh_hour" ]
  do
    lh_hour=${lh_hour#0}
  done
  echo "$lh_hour"
}

# Walk through the collected status lines. The fields used are:
# 2 = destination, 3 = state, 4 = lag, 5 = status.
# Every destination whose lag exceeds ERROR_LAG hours is written to
# ${SNAPMAIL} together with its "Current Transfer Error:" line.
while read -r src dest state lag xfer rest
do

  # get lag in hours; lines without a numeric lag are skipped
  hour=`lag_hours "$lag"` || continue

  if [ "$hour" -gt "$ERROR_LAG" ]; then
     filer=${dest%%:*}   # filer name is the part before the first ':'
     # ssh -n keeps ssh from reading the loop's input
     error=`ssh -n "${filer}" snapvault status -l "$dest" | grep '^Current Transfer Error:'`
     printf '%s %s %s %s %s\n' "$dest" "$state" "$lag" "$xfer" "$error" >> "${SNAPMAIL}"
  fi

done < "${SNAPVAULT}"

# Append a dated header plus the collected status lines to the history log
{
  echo "PBS archive: `date`"
  cat ${SNAPVAULT}
} >> ${ARCHIVE}

# Total = number of collected status lines that contain ec_vault
TOTAL_SNAPVAULTS=`grep ec_vault ${SNAPVAULT} | wc -l | tr -d ' '`

# Make sure the file with the filtered failures exists
touch ${FAILED}

# filter exceptions
#
# sv_listed PATTERN FILE
#   Returns 0 only when PATTERN is non-empty and grep finds it in FILE.
#   An empty pattern counts as "not listed", because grep would otherwise
#   match every line. A grep error (e.g. unreadable FILE) also counts as
#   "not listed"; its message stays visible on stderr.
#   The matching lines themselves are discarded.
sv_listed() {
  [ -n "$1" ] || return 1
  grep "$1" "$2" > /dev/null
}

# Copy every line of ${SNAPMAIL} to ${FAILED} unless its volume, its qtree or
# its error string is found in the corresponding exception file. A line is
# only dropped on a positive match, so a missing exception file or a missing
# error string can no longer hide a failure.
while read -r line
do
 [ -n "$line" ] || continue   # nothing to examine on a blank line
 qtree=`printf '%s\n' "$line" | awk '{print $1}'`
 filer=${qtree%%:*}
 volume=`printf '%s\n' "$qtree" | awk -F/ '{print $3}'`
 volume="${filer}:/vol/${volume}"
 # field 5 (':' separated) should be the error string; strip its leading space
 error=`printf '%s\n' "$line" | awk -F: '{print $5}' | sed 's/^ //'`

 sv_listed "$volume" "${VOLUMESKIP}" && continue   # exception volume
 sv_listed "$qtree" "${QTREESKIP}" && continue     # exception qtree
 sv_listed "$error" "${ERRORS}" && continue        # cause outside ATOS controlled domain

 printf '%s\n' "$line" >> "${FAILED}"
done < "${SNAPMAIL}"

# Failed snapvaults: lines in the filtered list that contain Idle
FAILED_SNAPVAULTS=`grep -c Idle ${FAILED}`

# Snapvaults not ready: the remaining lines of the filtered list
RUNNING_SNAPVAULTS=`grep -c -v Idle ${FAILED}`

# Calculate %success and %ready
#
# pct_remaining TOTAL PART
#   Prints the percentage of TOTAL that is not covered by PART:
#     PART empty or <= 0 -> 100   (plain "100", nothing to subtract)
#     TOTAL <= PART      -> 0.00  (also covers TOTAL = 0, which would
#                                  otherwise be a division by zero)
#     otherwise          -> 100 * (TOTAL - PART) / TOTAL, two decimals
#   The value is computed in one step and rounded once, so e.g. 7 and 3
#   give 57.14. LC_ALL=C keeps the decimal point a '.' for the csv file.
pct_remaining() {
  if [ "${2:-0}" -le 0 ]; then
    echo 100
  elif [ "${1:-0}" -le "$2" ]; then
    echo 0.00
  else
    echo "$1 $2" | LC_ALL=C awk '{ printf "%.2f\n", 100 * ($1 - $2) / $1 }'
  fi
}

# %success: share of the snapvaults that did not fail
PCT_SUCCESS=`pct_remaining "$TOTAL_SNAPVAULTS" "$FAILED_SNAPVAULTS"`

# %ready: share of the snapvaults that are not still running
PCT_READY=`pct_remaining "$TOTAL_SNAPVAULTS" "$RUNNING_SNAPVAULTS"`

# Append one record to the statistics file. Column order:
# date,total snapvaulted qtrees,total failed snapvaults,total snapvaults not ready,%success,%ready
stats_record="`date +%Y-%m-%d`,${TOTAL_SNAPVAULTS},${FAILED_SNAPVAULTS},${RUNNING_SNAPVAULTS}"
stats_record="${stats_record},${PCT_SUCCESS},${PCT_READY}"
echo "${stats_record}" >> ${SNAPSTATS}

# email for control purposes
# The list of failed snapvaults is mailed when it has content; otherwise the
# current date is sent as the body, so the mail still shows that the script
# did run. The test is -s (exists and is not empty) rather than -f: the file
# is always created earlier in the script, so -f could never reach the else
# branch.
# RECIPIENTS is left unquoted so that it may hold several addresses.
if [ -s "${FAILED}" ]; then
  mailx -s ":`hostname`: $0: did run" ${RECIPIENTS} < "${FAILED}"
else
  date| mailx -s ":`hostname`: $0: did run" ${RECIPIENTS}
fi

# remove the per-run collection file
rm -f "${SNAPVAULT}" > /dev/null 2>&1

