#!/bin/bash
#
#  rocon-check: check connectivity between a rocon server and its clients.
#
#  The configuration file decides which role(s) this host plays:
#
#    ROCON_CHECK_SERVER=true   run rocon against the hosts selected by
#                              ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION,
#                              report any host that did not answer "OK", and
#                              touch a marker file on each host.
#    ROCON_CHECK_CLIENT=true   check that the marker file on this host is
#                              being kept fresh by the server, and report if
#                              it is not.
#
#  Reports go to stdout or are mailed, depending on --report-mode.
#
#  This script is built on the ADE shell library (ade.sh); all ade_* functions
#  and the ADE_OK / ADE_FAIL constants come from there.
#

APP_SVNID='$HeadURL$ $LastChangedRevision$'
. "$(ade-config ade_share_prefix)/include/ade.sh" || { echo "${0##*/}: INTERNAL ERROR: failed to load ade.sh" >&2; exit 3; }

#  Application-specific error types, registered with ADE in rocon_check().
ROCON_CHECK_DEFINED_ERRORS=(
    "KEY=ROCON_CHECK_ERR_MISC; FMT=\"%s\""
)

#  The rocon command is expected to live alongside this script.
ROCON_CMD=$(dirname "$0")/rocon

#
#  rocon_check ERRSTACK_REF [ARG ...]
#
#  Main entry point (invoked through ade_main). Parses options, loads and
#  validates the configuration file, then runs the server and/or client
#  checks. Returns $ADE_OK on success, or a failure code from ADE or from
#  one of the checks.
#
rocon_check()
{
    local ERRSTACK_REF="$1"; shift
    local -a NEW_DOLLAR_AT
    local PROGNAME

    #  Register application-specific errors
    ade_register_error_types ROCON_CHECK_DEFINED_ERRORS

    #  Defaults for options
    ade_get_progname "$ERRSTACK_REF" PROGNAME
    OPT_REPORT_MODE=mail
    OPT_MAILTO=root
    OPT_CFGFILE=$(rocon-config rocon_etc_prefix)/$PROGNAME.conf
    OPT_ROCONCMD=$ROCON_CMD

    #  Register rocon-check's options
    ade_register_options "$ERRSTACK_REF" -o f: --longoptions=config-file:,report-mode:,mail-to: --callback-template="rocon_check_opt_handler_%s" || return $?
    ade_set_callbacks "$ERRSTACK_REF" rocon_check_usage_help rocon_check_version rocon_check_paths || return $?

    #  Process options; whatever is left over comes back in NEW_DOLLAR_AT.
    ade_process_options "$ERRSTACK_REF" NEW_DOLLAR_AT "$@" || return $?
    set -- "${NEW_DOLLAR_AT[@]}"

    #  Process arguments (none are accepted)
    [ $# -eq 0 ] || ade_show_bad_usage "$ERRSTACK_REF"

    #  Sanity checks and derivations. These two are deliberately global:
    #  rocon_check_server() and rocon_check_client() read them.
    TOUCHEDBYSERVER_FILE=/tmp/$PROGNAME.touch
    WAITINPROGRESS_FILE=/tmp/$PROGNAME.grace-period-running

    #  The configuration file must exist, be readable, and be loadable. The
    #  trial load runs in a separate shell so that a broken file cannot
    #  damage this process.
    [[ -f "$OPT_CFGFILE" && -r "$OPT_CFGFILE" ]] || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "$OPT_CFGFILE: not accessible"
        return $ADE_FAIL
    }
    sh -c '. "$1"' sh "$OPT_CFGFILE" > /dev/null 2>&1 || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "$OPT_CFGFILE: not loadable"
        return $ADE_FAIL
    }

    #  Load configuration file. Are we a server, a client or both?
    . "$OPT_CFGFILE"
    ROCON_CHECK_SERVER=${ROCON_CHECK_SERVER:-false}
    ade_debug "$ERRSTACK_REF" 10 "main: ROCON_CHECK_SERVER=$ROCON_CHECK_SERVER"
    ROCON_CHECK_CLIENT=${ROCON_CHECK_CLIENT:-true}
    ade_debug "$ERRSTACK_REF" 10 "main: ROCON_CHECK_CLIENT=$ROCON_CHECK_CLIENT"
    [[ "$ROCON_CHECK_SERVER" == true || "$ROCON_CHECK_SERVER" == false ]] || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "$OPT_CFGFILE: ROCON_CHECK_SERVER is not 'true' or 'false' or undefined"
        return $ADE_FAIL
    }
    [[ "$ROCON_CHECK_CLIENT" == true || "$ROCON_CHECK_CLIENT" == false ]] || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "$OPT_CFGFILE: ROCON_CHECK_CLIENT is not 'true' or 'false' or undefined"
        return $ADE_FAIL
    }
    #  From here on both flags are known to be exactly "true" or "false", so
    #  they can safely be run as the commands of the same name.
    ! $ROCON_CHECK_SERVER || [ "X$ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION" != X ] || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "$OPT_CFGFILE: ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION: not defined"
        return $ADE_FAIL
    }

    #  Is report mode valid?
    ade_debug "$ERRSTACK_REF" 10 "OPT_REPORT_MODE=$OPT_REPORT_MODE"
    case "$OPT_REPORT_MODE" in
        mail|stdout)
            :
            ;;
        *)
            ade_show_bad_usage "$ERRSTACK_REF"
            ;;
    esac

    #  Guts
    ! $ROCON_CHECK_SERVER || rocon_check_server "$ERRSTACK_REF" || return $?
    ! $ROCON_CHECK_CLIENT || rocon_check_client "$ERRSTACK_REF" || return $?

    #  If we get this far everything is okay.
    return $ADE_OK
}

#
#  rocon_check_server ERRSTACK_REF
#
#  Server-side checks: ask every selected host to say "OK" through rocon,
#  report every output line that is not a clean "OK", then touch
#  $TOUCHEDBYSERVER_FILE on every selected host so that the clients' own
#  checks can see the server is alive.
#
#  Reads the globals ROCON_CMD, ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION
#  and TOUCHEDBYSERVER_FILE.
#
rocon_check_server()
{
    local ERRSTACK_REF="$1"; shift
    local CMDLINE VERBOSELEVEL PROGNAME ERRFILE

    #  We pass on our verbosity to rocon.
    ade_get_verboselevel "$ERRSTACK_REF" VERBOSELEVEL || return $?
    ade_get_progname "$ERRSTACK_REF" PROGNAME

    ade_info "$ERRSTACK_REF" "performing server checks ..."

    #  Unpredictable temporary file for the collected error lines.
    ERRFILE=$(mktemp "${TMPDIR:-/tmp}/$PROGNAME.XXXXXX") || {
        ade_error "$ERRSTACK_REF" ROCON_CHECK_ERR_MISC "failed to create temporary file"
        return $ADE_FAIL
    }

    #  Can we reach everything which the rocon DB says we should be able to
    #  reach? Report anything which we could not reach.
    ade_debug "$ERRSTACK_REF" 10 "rocon_check_server: using rocon to get hosts to say 'OK' ..."
    #  The selector expression is spliced in unquoted and the result is
    #  eval'ed, so any quoting inside the expression is honoured. It comes
    #  from the administrator's own configuration file.
    printf -v CMDLINE '%q -c "echo OK" %s' "$ROCON_CMD" "$ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION"
    ade_debug "$ERRSTACK_REF" 10 "rocon_check_server: CMDLINE=[$CMDLINE]"
    #  Keep every line that is NOT "<host>[stdout]: OK"; whatever remains is
    #  an error. [[:blank:]] matches spaces and tabs ("\t" inside a bracket
    #  expression would match a backslash and a "t" instead).
    eval "$CMDLINE" 2>&1 | grep -E -v '^[a-z][-a-z0-9]*\[stdout\]:[[:blank:]]*OK$' > "$ERRFILE"
    [[ ! -s "$ERRFILE" ]] || report "$ERRSTACK_REF" <<EOF
The following errors occurred:

$(cat "$ERRFILE")

To fix these:

1) Log in to $(uname -n).
2) Run "ssh <host> pwd".
3) Examine the error messages.
4) Act accordingly; typically this means doing one of the following:
   i)   turn <host> on, or
   ii)  remove an old SSH host key from ~/.ssh/known_hosts, or
   iii) accept the addition of a new SSH host key to ~/.ssh/known_hosts, or
   iv)  remove an obsolete entry from the rocon database (use "rocon -e").
EOF
    rm -f -- "$ERRFILE"

    #  Secondly, clients make their own checks; they check that
    #  $TOUCHEDBYSERVER_FILE is being regularly touched. They'll complain if
    #  it's not touched. So touch it. But we don't care if these fail
    #  (connection errors were reported above).
    ade_debug "$ERRSTACK_REF" 10 "rocon_check_server: using rocon to touch file on hosts ..."
    #  The selector expression is intentionally unquoted so that it splits
    #  into separate arguments.
    # shellcheck disable=SC2086
    "$ROCON_CMD" --debug="$VERBOSELEVEL" -c "touch $TOUCHEDBYSERVER_FILE" $ROCON_CHECK_SERVER_CLIENT_SELECTOR_EXPRESSION >/dev/null 2>&1 || true

    return $ADE_OK
}

#
#  Option handlers. ADE calls rocon_check_opt_handler_<option> (see the
#  --callback-template passed to ade_register_options) with the error stack
#  reference followed by the option's value.
#

#  -f is the short form of --config-file.
rocon_check_opt_handler_f()
{
    rocon_check_opt_handler_config_file "$@"
}

#  --report-mode=<mode>: validated later, in rocon_check().
rocon_check_opt_handler_report_mode()
{
    local ERRSTACK_REF="$1"; shift
    OPT_REPORT_MODE="$1"
    return $ADE_OK
}

#  --mail-to=<recipient>: who receives mailed reports.
rocon_check_opt_handler_mail_to()
{
    local ERRSTACK_REF="$1"; shift
    OPT_MAILTO="$1"
    return $ADE_OK
}

#  --config-file=<file>: alternative configuration file.
rocon_check_opt_handler_config_file()
{
    local ERRSTACK_REF="$1"; shift
    OPT_CFGFILE="$1"
    return $ADE_OK
}

#
#  rocon_check_client ERRSTACK_REF
#
#  Client-side check: verify that the server keeps touching
#  $TOUCHEDBYSERVER_FILE on this host. A grace period, tracked through the
#  age of $WAITINPROGRESS_FILE, is allowed before the absence of the marker
#  is reported. Always returns $ADE_OK; problems are reported via report().
#
rocon_check_client()
{
    local ERRSTACK_REF="$1"; shift
    #  Maximum tolerated age in seconds. 259000 is slightly under the
    #  "3 days" the comments below talk about (3 days = 259200 seconds).
    local GRACE_SECS=259000

    ade_info "$ERRSTACK_REF" "performing client checks ..."
    ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: OPT_REPORT_MODE=$OPT_REPORT_MODE, TOUCHEDBYSERVER_FILE=$TOUCHEDBYSERVER_FILE, WAITINPROGRESS_FILE=$WAITINPROGRESS_FILE"

    #  If the server has not created its file here and the file which
    #  indicates that we have started waiting 3 days does not exist, then
    #  create the file which indicates we have started waiting 3 days.
    if [[ ! -f "$TOUCHEDBYSERVER_FILE" && ! -f "$WAITINPROGRESS_FILE" ]]; then
        ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: touch file does not exist and nor does grace file; starting grace period ..."
        touch -- "$WAITINPROGRESS_FILE"
        return $ADE_OK

    #  If the server has not created its file here but we have not yet
    #  waited 3 days then do nothing.
    elif [ ! -f "$TOUCHEDBYSERVER_FILE" ] && [ "$(fileage "$WAITINPROGRESS_FILE")" -le "$GRACE_SECS" ]; then
        ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: touch file does not exist but grace period is running; nothing to do yet"
        return $ADE_OK

    #  If the server has not created its file here then (by excluding the
    #  possibilities handled above) we have waited 3 days so we need to flag
    #  the error. (We fall through into the reporting code by not returning
    #  here.)
    elif [ ! -f "$TOUCHEDBYSERVER_FILE" ]; then
        ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: touch file does not exist and grace period is over; will report in a moment"

    #  So the server has created its file. If it was touched recently enough
    #  (within the same 3-day allowance) then there is no error.
    #  NOTE(review): $WAITINPROGRESS_FILE is never removed once the server
    #  has shown up, so if the marker later disappears the old grace file
    #  will already look expired - confirm that this is intended.
    elif [ "$(fileage "$TOUCHEDBYSERVER_FILE")" -le "$GRACE_SECS" ]; then
        ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: touch file exists and is fresh or insufficiently stale; nothing to do"
        return $ADE_OK

    #  Otherwise the file exists and is "stale". (We fall through into the
    #  reporting code by not returning here.)
    else
        ade_debug "$ERRSTACK_REF" 10 "rocon_check_client: touch file exists but is stale; will report in a moment"
    fi

    #  Here we report the error.
    report "$ERRSTACK_REF" <<EOF
The following errors occurred:

server not accessing client $HOSTNAME

To fix this:

1) Log in to rocon server.
2) Run "ssh $HOSTNAME uptime" and check it works without interaction
3) If it works without interaction and the uptime suggests it has been turned
   on since this mail was sent, then either ensure it is up all the time (by
   leaving it on, contacting its user, etc) or remove the system from your
   rocon configuration.
4) If it does not work without interaction then repeat the command and make
   sure that it *now* works without interaction.
5) Run "rocon -e" and check there is one and only one entry for $HOSTNAME.
EOF

    return $ADE_OK
}

#
#  fileage FILE
#
#  Print the number of seconds since FILE was last modified (current time
#  minus the file's mtime, as computed by perl) on stdout.
#
#  ERRSTACK_REF is not a parameter; it is picked up from the calling
#  function's scope for the debug message.
#
fileage()
{
    local FILE="$1"
    local FILEAGE

    FILEAGE=$(perl -e 'printf "%d\n", time - (stat($ARGV[0]))[9];' "$FILE")
    #  Callers capture our stdout, so make sure the debug message cannot end
    #  up in the captured value.
    ade_debug "$ERRSTACK_REF" 10 "fileage: FILE=$FILE, FILEAGE=$FILEAGE" >&2
    echo "$FILEAGE"
}

#
#  rocon_check_usage_help ERRSTACK_REF USAGE_TEXT_SHORT_REF USAGE_TEXT_LONG_REF
#
#  ADE callback: store this program's contribution to the usage message in
#  the variables whose names are given. The short text is empty.
#
#  TODO: -f/--config-file and --mail-to are accepted but not listed here.
#
rocon_check_usage_help()
{
    local ERRSTACK_REF="$1"; shift
    local USAGE_TEXT_SHORT_REF="$1"; shift
    local USAGE_TEXT_LONG_REF="$1"; shift

    printf -v "$USAGE_TEXT_SHORT_REF" '%s' ''
    printf -v "$USAGE_TEXT_LONG_REF" '%s' '--report-mode={stdout|mail}    choose report mode'

    return $ADE_OK
}

#
#  rocon_check_paths ERRSTACK_REF PATHLIST_REF
#
#  ADE callback: this program has no paths to list, so the variable whose
#  name is given is set to the empty string.
#
rocon_check_paths()
{
    local ERRSTACK_REF="$1"; shift
    local PATHLIST_REF="$1"; shift

    printf -v "$PATHLIST_REF" '%s' ''

    return $ADE_OK
}

#
#  rocon_check_version ERRSTACK_REF VERSION_REF
#
#  ADE callback: have ADE extract the version from $APP_SVNID into the
#  variable whose name is given. The result of the extraction is not
#  checked; $ADE_OK is always returned.
#
rocon_check_version()
{
    local ERRSTACK_REF="$1"; shift
    local VERSION_REF="$1"; shift

    ade_extract_version "$ERRSTACK_REF" "$APP_SVNID" "$VERSION_REF"

    return $ADE_OK
}

#
#  report ERRSTACK_REF  < message
#
#  Deliver the message read from stdin: copy it to stdout when
#  OPT_REPORT_MODE is "stdout", otherwise mail it to $OPT_MAILTO.
#
report()
{
    local ERRSTACK_REF="$1"; shift
    local PROGNAME

    #  Sanity checks and derivations
    ade_get_progname "$ERRSTACK_REF" PROGNAME

    if [ "$OPT_REPORT_MODE" = stdout ]; then
        cat
    else
        #  OPT_MAILTO is intentionally unquoted so that it may hold several
        #  whitespace-separated recipients.
        # shellcheck disable=SC2086
        mailx -s "$PROGNAME($(uname -n)) report" $OPT_MAILTO
    fi

    return $ADE_OK
}

ade_main rocon_check "$@"