#! /bin/sh
#
# $Id: cluster_fmstart.sh,v 6.15.30.2 2006/04/26 19:32:49 castrv Exp $ Copyright (c) 2000-2005 EMC Corporation
#
#
# Copyright (c) 2000-2005 EMC Corporation.
#
# All rights reserved. This is an UNPUBLISHED work, and
# comprises proprietary and confidential information of EMC.
# Unauthorized use, disclosure, and distribution are strictly
# prohibited. Use, duplication, or disclosure of the software
# and documentation by the U.S. Government is subject to
# restrictions set forth in a license agreement between the
# Government and EMC or other written agreement specifying
# the Government's rights to use the software and any applicable
# FAR provisions, such as FAR 52.227-19.
# *****************************************************************
# *                                                               *
# *   Copyright (c) Digital Equipment Corporation, 2002           *
# *                                                               *
# *   All rights reserved. Unpublished rights reserved under      *
# *   the copyright laws of the United States.                    *
# *                                                               *
# *   The software contained on this media is proprietary to      *
# *   and embodies the confidential technology of Digital         *
# *   Equipment Corporation. Possession, use, duplication or      *
# *   dissemination of the software and media is authorized only  *
# *   pursuant to a valid written license from Digital Equipment  *
# *   Corporation.                                                *
# *                                                               *
# *   RESTRICTED RIGHTS LEGEND Use, duplication, or disclosure    *
# *   by the U.S. Government is subject to restrictions as set    *
# *   forth in Subparagraph (c)(1)(ii) of DFARS 252.227-7013,     *
# *   or in FAR 52.227-19, as applicable.                         *
# *                                                               *
# *****************************************************************
#
#
# High Availability NetWorker probe method, used to check for the
# proper functioning of a NetWorker agent operating under the Sun
# Cluster Framework.
#
# As part of NetWorker cluster configuration (see networker.cluster),
# this script is registered as NetWorker's FM_START method.
# Using 'pmfadm' the registration of the monitor procedure is a call to
# this same script with an argument which distinguishes this script
# as the actual probe vs. the FM_START method.
#
#
# PROBE OPERATIONS:
#
# The probing process involves doing two levels of checks on the
# NetWorker agent. First, we need to do a system level check to
# determine whether or not the nsrd daemon, the NetWorker server, is
# actually visible on the node. This is called the "Looks Alive"
# cluster control operation. The next level of check is the cluster
# control "Is Alive" operation. This operation runs 'nsrlic' under
# 'hatimerun' against the NetWorker server. If that command returns a
# non-zero status we've determined that something isn't correct. It is
# after repeated failed attempts of either of these two operations that
# the probe action then (heuristically) chooses to initiate a failover.
#
#
#---------------------------------------------------------------------
# Find processes
#
# INPUTS:
#	ARCH=             Platform tag (SOLARIS, DIGITAL, or anything
#	                  else); selects the 'ps' flags that are used.
#	process_list=     The list of processes names to find
# OUTPUTS:
#	active_processes= The ps output of the processes we targeted
#	                  (empty when nothing matched or nothing was
#	                  asked for).
# SIDE EFFECTS:
#	Overwrites the globals ps_output, pattern and daemon.
#---------------------------------------------------------------------
#
find_processes()
{
	# Take one snapshot of the process table, pre-filtered to names
	# that look NetWorker related.  ARCH is quoted so that an unset
	# value simply selects the generic branch instead of making the
	# test itself fail.
	if [ "${ARCH}" = SOLARIS ]; then
		ps_output="`ps -ax | egrep 'nsr|save|nwadmin'`"
	elif [ "${ARCH}" = DIGITAL ]; then
		ps_output="`ps -eopid,tt,time,comm | egrep 'nsr|save'`"
	else
		ps_output="`ps -e | egrep 'nsr|save|nwadmin'`"
	fi

	# build up an egrep style pattern we can feed to awk which will match
	# all the various daemons: a digit, a blank, an optional path prefix
	# and then one of the requested names.
	pattern=
	for daemon in $process_list
	do
		if [ -z "${pattern}" ]; then
			pattern='[0-9] [-a-zA-Z\+\/._0-9]*('"${daemon}"
		else
			pattern="${pattern}|${daemon}"
		fi
	done

	# An empty process_list would otherwise leave us with the invalid
	# regular expression ")" and an awk syntax error.
	if [ -z "${pattern}" ]; then
		active_processes=
		return 0
	fi
	pattern="${pattern}"')'

	if [ -n "${ps_output}" ]; then
		# Skip any line that mentions awk, then keep only the lines
		# that match the pattern.
		active_processes=`echo "${ps_output}" | awk '/awk/ { next }
			$0 ~ /'"${pattern}"'/ { print $0 }'`
	else
		active_processes=
	fi
}

# Parse program arguments.
#
# parse_args: scan the argument list for the cluster method options and
# record their values in globals:
#	-R <name>	-> RESOURCE_NAME
#	-T <name>	-> RESOURCETYPE_NAME
#	-G <name>	-> RESOURCEGROUP_NAME
# "-probe" and every other word are skipped.
#
parse_args()
{
	while [ $# -gt 0 ]
	do
		case $1 in
		-probe )
			;;
		-R )
			RESOURCE_NAME=$2
			# Only consume the value if there is one, so a
			# trailing option cannot shift past the end.
			if [ $# -gt 1 ]; then
				shift
			fi
			;;
		-T )
			RESOURCETYPE_NAME=$2
			if [ $# -gt 1 ]; then
				shift
			fi
			;;
		-G )
			RESOURCEGROUP_NAME=$2
			if [ $# -gt 1 ]; then
				shift
			fi
			;;
		* )
			;;
		esac
		shift
	done
}

#---------------------------------------------------------------------------
# Check execution
#	Return "1" if NetWorker has been started as cluster server on
#	this node, "0" otherwise.
#
# INPUTS:
#	$1		The logical host name (also stored in MYLH).
#	CLU_TYPE	SC22 or SC30.
# SIDE EFFECTS:
#	Sets MYLH, answer, process_list, pattern and pid; calls
#	find_processes, which sets active_processes.
#---------------------------------------------------------------------------
NW_CLUSTER_CHECK()
{
	MYLH=$1
	answer=0

	# On SC22 the check is only made when /<logical host> exists as a
	# directory; on SC30 it is always made.
	if [ "${CLU_TYPE}" = SC22 ] && [ -d "/${MYLH}" ] || [ "${CLU_TYPE}" = SC30 ]; then
		# Need to check to see that we haven't already started nsrd
		# in cluster mode already; do that by looking at the arguments
		# to possible nsrd processes out there.
		process_list=nsrd
		find_processes
		if [ -n "${active_processes}" ]; then
			# Need to check that the NetWorker daemon is not
			# running as clustered data service.
			pattern="nsrd -k ${MYLH}"
			pid="`echo "${active_processes}" | egrep "${pattern}" | awk '{ print $1 }'`"
			if [ -n "${pid}" ]; then
				answer=1
			fi
		fi
	fi
	return ${answer}
}

#
#---------------------------------------------------------------------
# Probe:
#	Execute the "Looks Alive" and "Is Alive" actions to determine
#	that the NetWorker server is acting properly. If not, return
#	non-zero value from this routine.
#
#	By the Sun Cluster mechanisms, this is the bulk of the probe
#	method. It should spin in a loop indefinitely until the script
#	is shut down by the FM_STOP method.
#
#	Returns "1" if the "Looks Alive" action fails; "2" if the
#	"Is Alive" action fails; otherwise, should not return.
#
# INPUTS:
#	$1		The logical host name.
#	CLU_TYPE, RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME,
#	MTHDDIR, HATIMERUN, LGBIN, PROBE_TIMEOUT_SECS, PROBE_INTERVAL_SECS
#
#---------------------------------------------------------------------
#
NW_CLUSTER_PROBE()
{
	count=0
	MYLH=$1
	retries=10
	process_list=nsrd

	while true ; do
		# The "Looks Alive" action...
		count=0
		while true ; do
			if [ ${count} -ge ${retries} ]; then
				return 1
			fi
			NW_CLUSTER_CHECK ${MYLH}
			if [ $? -eq 0 ]; then
				# The NetWorker agent is not running on this node.
				# Attempt to start it up.

				# pmfadm in SC 3.0 sets the max file descriptor very high
				# (65536) which will cause a Segmentation Fault in nsrexecd.
				# Reset it to something lower (1024 or less)...
				HLIMIT=`ulimit -H -n`
				case "${HLIMIT}" in
				'' | *[!0-9]* )
					# "unlimited" (or anything else that is not
					# a number) is certainly above 1024.
					ulimit -S -n 1024
					;;
				* )
					if [ "${HLIMIT}" -gt 1024 ]; then
						ulimit -S -n 1024
					else
						ulimit -S -n "${HLIMIT}"
					fi
					;;
				esac
				if [ "${CLU_TYPE}" = SC30 ]; then
					scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s DEGRADED -m "daemons missing"
				fi
				${MTHDDIR}/networker.start "${MYLH}" -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -T ${RESOURCETYPE_NAME}
			else
				# Check worked. Pop out of this loop
				break
			fi
			sleep 10
			count=`expr ${count} + 1`
		done

		# Now, the "Is Alive" action...
		count=0
		while true ; do
			if [ ${count} -ge ${retries} ]; then
				return 2
			fi
			${HATIMERUN} -t ${PROBE_TIMEOUT_SECS} ${LGBIN}/nsrlic -s ${MYLH} > /dev/null 2>&1
			if [ $? -eq 0 ]; then
				# The probe worked. Pop out of this loop
				break
			fi
			sleep 10
			count=`expr ${count} + 1`
		done

		if [ "${CLU_TYPE}" = SC30 ]; then
			scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s OK -m "online, monitored"
		fi
		sleep ${PROBE_INTERVAL_SECS}
	done
}

#
#---------------------------------------------------------------------
# Main Program:
#
#	Invoked without "-probe" (FM_START): registers this same script
#	with pmfadm as the probe and exits.
#	Invoked with "-probe <logical host>": runs the probe loop and
#	asks the cluster software to fail over when the probe gives up.
#---------------------------------------------------------------------
#
uname=`uname`
if [ "X${uname}" = "XSunOS" ]; then
	ARCH=SOLARIS

	# Sort out which cluster version
	if pkginfo -q SUNWscmgr; then
		# Sun Cluster 2.2
		CLU_TYPE=SC22
	fi
	if pkginfo -q SUNWscr; then
		# Sun Cluster 3.0
		CLU_TYPE=SC30
	fi

	if pkginfo -q LGTOclnt; then
		# EMC NetWorker
		NSR_STOP="/usr/sbin/nsr_shutdown -q"
		PRODUCT="NetWorker"
		LGBIN=/usr/sbin
	fi
	if pkginfo -q SUNWebsc; then
		# Sun StorEdge(TM) Enterprise Backup
		NSR_STOP="/usr/sbin/nsr/nsr_shutdown -q"
		PRODUCT="Sun StorEdge(TM) Enterprise Backup"
		LGBIN=/usr/sbin/nsr
	fi
else
	exit 1
fi

# NOTE: some systems put "Berkeley" utilities in /usr/ucb, others (e.g. SGI)
# put them in /usr/bsd. Also, some systems use /usr/etc and other use
# /usr/sbin. We include all variants in addition to the path to this
# program to be safe.
#
mypath="`expr X\"${0}\" : X'\(.*\)/.*' \| X\"${0}\" : X'\(/\)[^/]*$' \| '.'`"
PATH=/usr/ucb:/usr/bsd:/bin:/usr/bin:/etc:/usr/etc:/usr/sbin:$mypath:$PATH
if [ "X${ARCH}" = "XDIGITAL" ]; then
	PATH=$PATH:/usr/opt/networker/bin
elif [ "${CLU_TYPE}" = SC30 ]; then
	PATH=$PATH:/usr/cluster/bin
fi
export PATH

ARGV0=$0

# CLU_TYPE is quoted in every test below: when neither cluster package
# is installed it is unset, and an unquoted expansion would turn each
# test into a syntax error.
if [ "${CLU_TYPE}" = SC22 ]; then
	SCBIN="/opt/SUNWcluster/bin"
	HACTL=${SCBIN}/hactl
	HAGET=${SCBIN}/haget
	HATIMERUN=${SCBIN}/hatimerun
	CLUSTER_KEY=`${HACTL} -f cluster_key`
	SYSLOG_FACILITY=`${HAGET} -f syslog_facility`
	MYLH=
	MTHDDIR=${LGBIN}
elif [ "${CLU_TYPE}" = SC30 ]; then
	parse_args ${1+"$@"}
	SCBIN=/usr/cluster/bin
	HATIMERUN=${SCBIN}/hatimerun
	SYSLOG_FACILITY=`scha_cluster_get -O SYSLOG_FACILITY`
	# The first output line of the EXTENSION queries is dropped.
	MYLH=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O EXTENSION Servername | tail +2`
	if [ -z "${MYLH}" ] ; then
		MYLH=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O NETWORK_RESOURCES_USED`
	fi
	NSR_SHARED_DISK_DIR=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O EXTENSION Config_dir | tail +2`
	MTHDDIR=/usr/lib/nsr
fi

FMPROBE=nsrdfm

if [ "X$1" = "X-probe" ]; then
	# Called as a periodic probe. Check for the proper functioning
	# of the second parameter and then scram.
	LH=$2
	PROBE_TIMEOUT_SECS=60
	PROBE_INTERVAL_SECS=60
	if [ "${CLU_TYPE}" = SC22 ]; then
		PATHPREFIX=`${HAGET} -f pathprefix -h ${LH}`
		CONFIG_DIR="${PATHPREFIX}/nsr"
		if [ -d ${CONFIG_DIR} ]; then
			MYLH=${LH}
		fi
	fi
	REGISTER="false"
elif [ "${CLU_TYPE}" = SC22 ]; then
	# Doing FM_START registration. Perform the functions to register
	# this script as NetWorker agent probe.
	REGISTER=true

	# Replace comma with space to form an sh word list:
	MASTERED_LOGICAL_HOSTS="`echo "$1" | tr ',' ' '`"

	#
	# Dynamically search the list of logical hosts which this physical
	# host currently masters, to see if one of them is the logical host
	# that NetWorker uses
	#
	for LH in ${MASTERED_LOGICAL_HOSTS} ; do
		# Map logical hostname to the administrative file system name:
		PATHPREFIX=`${HAGET} -f pathprefix -h ${LH}`
		CONFIG_DIR="${PATHPREFIX}/nsr"
		if [ -d ${CONFIG_DIR} ]; then
			MYLH=${LH}
			break
		fi
	done
elif [ "${CLU_TYPE}" = SC30 ]; then
	REGISTER=true
	parse_args ${1+"$@"}
fi

if [ -z "${MYLH}" ]; then
	# This host does not currently master the logical host.
	# Nothing to do.
	exit 0
elif [ "${REGISTER}" = "true" ]; then
	# Register this script as the probe function with the correct
	# arguments to distinguish it as such ($1 = "-probe")
	pmfadm -c ${FMPROBE} -n 1 -t 1 ${ARGV0} -probe ${MYLH} -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -T ${RESOURCETYPE_NAME}
else
	probes=0
	while true ; do
		# When this routine finishes, it means that we've encountered
		# a systematic problem that requires us to request the SC soft-
		# ware to handle via failover.
		NW_CLUSTER_PROBE ${MYLH}
		# Save the status right away: every later test overwrites $?,
		# so re-reading it in the elif would see the result of the
		# first comparison rather than of the probe.
		probe_status=$?
		if [ ${probe_status} -eq 1 ]; then
			# This is bad. Can't even get the "Looks Alive" check
			# working.
			logger -p ${SYSLOG_FACILITY}.err \
				"${FMPROBE}: Cleaning up non-existent NetWorker service"
		elif [ ${probe_status} -eq 2 ]; then
			# Something else is wrong. Cannot seem to communicate to
			# the nsrd daemon.
			logger -p ${SYSLOG_FACILITY}.err \
				"${FMPROBE}: Cannot communicate with NetWorker service"
		fi
		if [ "${CLU_TYPE}" = SC30 ]; then
			scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s FAULTED -m "Server unreachable"
		fi
		${NSR_STOP}
		probes=`expr ${probes} + 1`
		logger -p ${SYSLOG_FACILITY}.err \
			"${FMPROBE}:${probes}:: giving up mastery of ${MYLH}"
		if [ "${CLU_TYPE}" = SC22 ]; then
			${HACTL} -g -s networker -k ${CLUSTER_KEY} -l ${MYLH}
		else
			scha_control -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME -O GIVEOVER
		fi
	done
fi

#
# Any non zero exit will be considered a failure.
#
exit 0