Revision 6740d3ab73923164703a0ba2b11fc0294e8ea12e authored by John Chilton on 04 July 2020, 14:16:28 UTC, committed by John Chilton on 04 July 2020, 14:16:28 UTC
1 parent 78273d2
check_galaxy.sh
#!/bin/sh
#set -xv
#
# Runs the scripts/check_galaxy.py script in a way that's easy to handle from cron
#
# defaults (note: default sleep is below since it depends on debug)
DEBUG=0       # -d: set to 1 to print debugging information
STAGGER=0     # -s: set to 1 to notify one address per interval
INTERVAL=3    # -i: wrapper runs between staggered notifications
MAIL=         # -m: whitespace-separated list of email recipients
PAGE=         # -p: whitespace-separated list of pager recipients
NEWHIST=      # -n: becomes "-n" and is passed through to check_galaxy.py
BADARG=0      # set to 1 when an option is unknown or malformed

# get commandline opts
while getopts dsi:l:m:p:n optname
do
    case $optname in
        d) DEBUG=1 ;;
        s) STAGGER=1 ;;
        i)
            # INTERVAL is later compared with -eq/-ge, so it must be a
            # non-negative integer; anything else is a usage error.
            case $OPTARG in
                ''|*[!0-9]*) BADARG=1 ;;
                *) INTERVAL=$OPTARG ;;
            esac
            ;;
        l) SLEEP=$OPTARG ;;
        m) MAIL="$MAIL $OPTARG" ;;
        p) PAGE="$PAGE $OPTARG" ;;
        n) NEWHIST="-n" ;;
        *) BADARG=1 ;;
    esac
done
# expr + backticks (rather than $(( ))) keep this usable with older /bin/sh
shift `expr $OPTIND - 1`

# A Galaxy host is mandatory; show usage if it is missing or if option
# parsing flagged a problem.  BADARG is compared explicitly because it is
# always non-empty ("0" or "1").
if [ -z "$1" ] || [ "$BADARG" = 1 ]; then
    cat <<EOF
usage: `basename "$0"` [-dsn] [-i interval] [-l seconds] [-m email_address]+ [-p pager_address]+ <galaxy_host>
-d Print debugging information.
-s Stagger mailing the pagers/emails, instead of all at once when
there's a problem. Useful for running check_galaxy at night.
-i <interval> The number of times this wrapper should execute before mailing
the next address, when staggering is enabled. Mail is sent
every <interval> runs of the program, so the actual time
between emails is:
time = (<interval>) * (how often wrapper runs from cron)
-l <seconds> This wrapper runs check_galaxy a second time if the first check
fails, in case the problem is intermittent. <seconds> is how
many seconds to sleep between checks.
-m <address> Email addresses to send the full check_galaxy output to, if
Galaxy is down. Use multiple -m options to specify multiple
addresses. When staggering, email will be sent in the order
which you specify -m options on the command line.
-p <address> Like -m, but sends just the last line of check_galaxy's output.
Useful for pagers. When staggering is enabled and both -m and
-p options are present, the first -m address and the first -p
address are mailed simultaneously, followed by the second -m
and second -p, and so on.
-n Create a new history (passes the -n option to check_galaxy.py).
<galaxy_host> The hostname of the Galaxy server to check. Use a : if running
on a non-80 port (e.g. galaxy.example.com:8080).
EOF
    exit 1
fi
# default_sleep <debug_flag>
# Print the default number of seconds to wait between the first and second
# check: 2 when <debug_flag> is 1 (so interactive debugging is quick),
# 60 otherwise.
default_sleep() {
    if [ "$1" = 1 ]; then
        echo 2
    else
        echo 60
    fi
}

# Only fill in SLEEP when -l did not provide a value.  DEBUG is always
# non-empty ("0" or "1"), so it has to be compared against 1 explicitly; a
# bare "[ $DEBUG ]" is true for both values.
if [ -z "$SLEEP" ]; then
    SLEEP=`default_sleep "$DEBUG"`
fi
# globals
# Directory holding this wrapper; $0 is quoted so an install path that
# contains whitespace is handled as a single argument.
CRON_DIR=`dirname "$0"`
# check_galaxy.py lives in the sibling "scripts" directory.
SCRIPTS_DIR="$CRON_DIR/../scripts"
CHECK_GALAXY="$SCRIPTS_DIR/check_galaxy.py"
# Per-user state directory (login file, mutex, counters, notification markers).
VAR="$HOME/.check_galaxy"
# sanity
# Without the checker script there is nothing to run.  Exit quietly with
# status 0 unless debugging was requested.
# The path is quoted so that a location containing whitespace is tested as
# one file name instead of producing a "[" syntax error.
if [ ! -f "$CHECK_GALAXY" ]; then
    [ "$DEBUG" = 1 ] && echo "$CHECK_GALAXY is missing"
    exit 0
fi
# The mutex check calls ps with BSD-style arguments ("ps p <pid>").
# On SunOS the BSD-compatible binary is /usr/ucb/ps; every other system is
# assumed to accept BSD arguments with its default ps.
# (Do any other systems' default ps not take BSD ps args?)
if [ "`uname -s`" = SunOS ]; then
    PS="/usr/ucb/ps"
else
    PS="ps"
fi
# Per-host state kept under $VAR/<galaxy_host>:
#   mail/, page/   one marker file per address that has already been notified
#   wrap.mutex     pid of the wrapper currently checking this host
#   wrap.count     number of runs that found the mutex still held
#   wrap.stagger   runs since the last staggered notification
NOTIFIED_MAIL="$VAR/$1/mail"
NOTIFIED_PAGE="$VAR/$1/page"
MUTEX="$VAR/$1/wrap.mutex"
COUNT="$VAR/$1/wrap.count"
STAGGER_FILE="$VAR/$1/wrap.stagger"

# Create the state directories on first use.  Every path is quoted so that a
# HOME or host argument containing whitespace or glob characters is not split.
# On failure the wrapper exits with status 0 and only reports when debugging.
for dir in "$VAR/$1" "$NOTIFIED_MAIL" "$NOTIFIED_PAGE"; do
    if [ ! -d "$dir" ]; then
        mkdir -p -m 0700 "$dir" || {
            [ "$DEBUG" = 1 ] && echo "unable to create dir: $dir"
            exit 0
        }
    fi
done
# The credentials file for this host must exist before anything is checked.
# When it is absent, explain how to create it (debug mode only) and stop
# with status 0.
[ -f "$VAR/$1/login" ] || {
    if [ $DEBUG = 1 ]; then
        cat <<EOF
Please create the file:
$VAR/$1/login
This should contain a username and password to log in to
Galaxy with, on one line, separated by whitespace, e.g.:
check_galaxy@example.com password
If the user does not exist, check_galaxy will create it
for you.
EOF
    fi
    exit 0
}
# read_stagger_count <file> <default>
# Print the counter stored in <file>.  If the file is missing, empty, or
# does not hold a non-negative integer, print <default> instead so that later
# numeric comparisons and expr arithmetic cannot fail on a corrupt file.
read_stagger_count() {
    _rsc_value=
    [ -f "$1" ] && _rsc_value=`cat "$1"`
    case $_rsc_value in
        ''|*[!0-9]*) echo "$2" ;;
        *) echo "$_rsc_value" ;;
    esac
}

# The stagger counter is only needed when -s was given.  STAGGER is always
# non-empty ("0" or "1"), so compare it against 1 explicitly.  With no usable
# stagger file the count starts at INTERVAL, which makes the first failing
# run notify immediately.
if [ "$STAGGER" = 1 ]; then
    STAGGER_COUNT=`read_stagger_count "$STAGGER_FILE" "$INTERVAL"`
fi
# only run one at once
# The mutex file holds the pid of the wrapper that is checking this host.
# If that process is still alive, count this skipped run and exit; if not,
# the mutex is stale and is removed so this run can proceed.
if [ -f "$MUTEX" ]; then
    pid=`cat "$MUTEX"`
    # An empty mutex cannot name a live process, so treat it as stale
    # without asking ps.
    if [ -n "$pid" ] && $PS p "$pid" >/dev/null 2>&1; then
        count=0
        [ -f "$COUNT" ] && count=`cat "$COUNT"`
        # Fall back to 0 if the counter file is empty or corrupt; otherwise
        # the -eq test and expr below would fail on every later run.
        case $count in
            ''|*[!0-9]*) count=0 ;;
        esac
        # Warn exactly once, on the run that finds the count at 3, and only
        # if there is somebody to warn.  $MAIL is deliberately unquoted: it
        # is a whitespace-separated list of recipients.
        if [ "$count" -eq 3 ] && [ -n "$MAIL" ]; then
            echo "A check_galaxy process for $1 has been running for an unusually long time. Something is broken." \
                | mail -s "$1 problems" $MAIL
        fi
        expr "$count" + 1 > "$COUNT"
        exit 0
    else
        # stale mutex
        rm -f "$MUTEX"
    fi
fi
# notify_recovered <galaxy_host>
# Mail "<galaxy_host> is now okay" to every address that has a notification
# marker under $NOTIFIED_MAIL or $NOTIFIED_PAGE, then remove the marker so the
# address is eligible for the next outage.
notify_recovered() {
    for file in "$NOTIFIED_MAIL"/* "$NOTIFIED_PAGE"/*; do
        # an unmatched glob is passed through literally; skip it
        [ -f "$file" ] || continue
        recip=`basename "$file"`
        echo "$1 is now okay" | mail -s "$1 OK" "$recip"
        rm -f "$file"
        [ "$DEBUG" = 1 ] && echo "up: mailed $recip"
    done
}

# Take the mutex for this run and reset the "still running" counter.
rm -f "$COUNT"
echo $$ > "$MUTEX"

# First check.  Its output is not used anywhere (only the second attempt is
# mailed), so it is discarded.  $NEWHIST is deliberately unquoted so that an
# empty value contributes no argument at all.
[ "$DEBUG" = 1 ] && echo "running first check"
if "$CHECK_GALAXY" $NEWHIST "$1" >/dev/null 2>&1; then
    # if successful
    [ "$DEBUG" = 1 ] && echo "first check succeeded"
    notify_recovered "$1"
    rm -f "$MUTEX" "$STAGGER_FILE"
    exit 0
fi

# if failure, wait and try again
[ "$DEBUG" = 1 ] && echo "first check failed, sleeping $SLEEP seconds for second run"
sleep "$SLEEP"

# Second check.  Its combined stdout/stderr is what gets mailed on failure.
[ "$DEBUG" = 1 ] && echo "running second check"
if second_try=`"$CHECK_GALAXY" $NEWHIST "$1" 2>&1`; then
    [ "$DEBUG" = 1 ] && echo "second check succeeded"
    notify_recovered "$1"
    rm -f "$STAGGER_FILE"
else
    [ "$DEBUG" = 1 ] && echo "second check failed"
    if [ "$STAGGER" = 1 ]; then
        # -ge rather than -eq: if the stored count is ever larger than
        # INTERVAL (for example after -i was lowered), -eq would never match
        # again and no further notification would be sent.
        if [ "$STAGGER_COUNT" -ge "$INTERVAL" ]; then
            # send notification this run
            echo 1 > "$STAGGER_FILE"
        else
            # don't send notification this run
            [ "$DEBUG" = 1 ] && echo "$1 is down, but it's not time to send an email. STAGGER_COUNT was $STAGGER_COUNT"
            expr "$STAGGER_COUNT" + 1 > "$STAGGER_FILE"
            rm -f "$MUTEX"
            exit 0
        fi
    fi
    # Full output goes to each -m address that has no marker yet.  When
    # staggering, stop after the first address mailed on this run.
    # $MAIL and $PAGE are deliberately unquoted: they are whitespace-separated
    # lists built from repeated -m / -p options.
    for recip in $MAIL; do
        if [ ! -f "$NOTIFIED_MAIL/$recip" ]; then
            cat <<HERE | mail -s "$1 problems" "$recip"
$second_try
HERE
            touch "$NOTIFIED_MAIL/$recip"
            [ "$DEBUG" = 1 ] && echo "dn: mailed $recip"
            [ "$STAGGER" = 1 ] && break
        fi
    done
    # Pagers receive only the last line of the output.
    for recip in $PAGE; do
        if [ ! -f "$NOTIFIED_PAGE/$recip" ]; then
            cat <<HERE | tail -1 | mail -s "$1 problems" "$recip"
$second_try
HERE
            touch "$NOTIFIED_PAGE/$recip"
            [ "$DEBUG" = 1 ] && echo "dn: mailed $recip"
            [ "$STAGGER" = 1 ] && break
        fi
    done
fi
rm -f "$MUTEX"
exit 0
Computing file changes ...