#! /bin/sh
#
# $Id: cluster_fmstart.sh,v 6.15.30.2 2006/04/26 19:32:49 castrv Exp $ Copyright (c) 2000-2005 EMC Corporation
#
#
# Copyright (c) 2000-2005 EMC Corporation.
#
# All rights reserved.  This is an UNPUBLISHED work, and
# comprises proprietary and confidential information of EMC.
# Unauthorized use, disclosure, and distribution are strictly
# prohibited.  Use, duplication, or disclosure of the software
# and documentation by the U.S. Government is subject to
# restrictions set forth in a license agreement between the
# Government and EMC or other written agreement specifying
# the Government's rights to use the software and any applicable
# FAR provisions, such as FAR 52.227-19.


# *****************************************************************
# *                                                               *
# *   Copyright (c) Digital Equipment Corporation, 2002           *
# *                                                               *
# *   All rights reserved.  Unpublished rights  reserved  under   *
# *   the copyright laws of the United States.                    *
# *                                                               *
# *   The software contained on this media  is  proprietary  to   *
# *   and  embodies  the  confidential  technology  of  Digital   *
# *   Equipment Corporation.  Possession, use,  duplication  or   *
# *   dissemination of the software and media is authorized only  *
# *   pursuant to a valid written license from Digital Equipment  *
# *   Corporation.                                                *
# *                                                               *
# *   RESTRICTED RIGHTS LEGEND   Use, duplication, or disclosure  *
# *   by the U.S. Government is subject to restrictions  as  set  *
# *   forth in Subparagraph (c)(1)(ii)  of  DFARS  252.227-7013,  *
# *   or  in  FAR 52.227-19, as applicable.                       *
# *                                                               *
# *****************************************************************
#
#
# High Availability NetWorker probe method, used to check for the
# proper functioning of a NetWorker agent operating under the Sun
# Cluster framework.
#
# As part of Networker cluster configuration (see networker.cluster),
# this script is registered as NetWorker's FM_START method.  Using
# 'pmfadm' the registration of the monitor procedure is a call to
# this same script with an argument which distinguishes this script
# as the actual probe vs. the FM_START method.
#
#
# PROBE OPERATIONS:
#
# The probing process involves doing two levels of checks on the
# NetWorker agent.  First, we need to do a system level check to
# determine whether or not the nsrd daemon, the NetWorker server, is
# actually visible on the node.  This is called the "Looks Alive"
# cluster control operation.  The next level of check is the cluster
# control "Is Alive" operation.  This operation runs 'nsrlic -s' under
# 'hatimerun' to query the NetWorker server on the logical host.  If
# that command does not exit with a zero status within the timeout, we've
# determined that something isn't correct.  It is after repeated
# failed attempts of either of these two operations that the probe
# action then (heuristically) chooses to initiate a failover.
#


#
#---------------------------------------------------------------------
# Find processes
#
# Reduces a `ps` listing to the lines that belong to the requested
# daemons.
#
# INPUTS:
#       process_list=     Whitespace separated list of process names to find
#       ARCH=             Platform tag selecting the `ps` flavour: SOLARIS,
#                         DIGITAL, anything else falls back to `ps -e`
# OUTPUTS:
#       active_processes= The ps output lines of the processes we targeted;
#                         empty when nothing matched or when process_list
#                         names no process at all.
# SIDE EFFECTS:
#       Overwrites the globals pattern, daemon and ps_output (this is
#       plain Bourne shell, there are no function-local variables).
#---------------------------------------------------------------------
#
find_processes()
{
    active_processes=

    # Collect the requested names into an egrep style alternation
    # ("nsrd|nsrexecd|...").
    pattern=
    for daemon in $process_list
    do
        if [ -z "${pattern}" ]; then
            pattern="${daemon}"
        else
            pattern="${pattern}|${daemon}"
        fi
    done

    # Nothing was asked for.  Without this guard the pattern handed to
    # awk would end in an unbalanced ")" and awk would fail on it.
    if [ -z "${pattern}" ]; then
        return 0
    fi

    # A line qualifies when a digit and a blank are followed by an
    # optional path-like prefix and then one of the daemon names
    # (presumably the end of the ps time column followed by the command).
    pattern='[0-9] [-a-zA-Z\+\/._0-9]*('"${pattern}"')'

    # ARCH is quoted so that an empty value selects the generic listing
    # instead of tripping a syntax error in the test.
    case "${ARCH}" in
        SOLARIS )
            ps_output="`ps -ax | egrep 'nsr|save|nwadmin'`"
            ;;
        DIGITAL )
            ps_output="`ps -eopid,tt,time,comm | egrep 'nsr|save'`"
            ;;
        * )
            ps_output="`ps -e | egrep 'nsr|save|nwadmin'`"
            ;;
    esac

    if [ -z "${ps_output}" ]; then
        return 0
    fi

    # Lines mentioning awk are skipped so the filter never reports itself.
    active_processes="`echo "${ps_output}" | awk '/awk/ { next }
                            $0 ~ /'"$pattern"'/ { print $0 }'`"
}

# Parse program arguments.
#
# Walks the argument list and stores the values of the resource
# identification flags in globals:
#       -R <name>       RESOURCE_NAME
#       -T <name>       RESOURCETYPE_NAME
#       -G <name>       RESOURCEGROUP_NAME
# "-probe" and every other word (for instance the logical host name that
# follows "-probe") are skipped.
#
# A flag that is the very last word has no value: its variable is set to
# the empty string and parsing ends cleanly instead of shifting past the
# end of the argument list.  Always returns 0.
#
parse_args()
{
    while [ $# -gt 0 ]
    do
        case "$1" in
            -probe )
                    ;;
            -R )
                    RESOURCE_NAME=$2
                    # Only consume the value when there is one.
                    if [ $# -gt 1 ]; then
                        shift
                    fi
                    ;;
            -T )
                    RESOURCETYPE_NAME=$2
                    if [ $# -gt 1 ]; then
                        shift
                    fi
                    ;;
            -G )
                    RESOURCEGROUP_NAME=$2
                    if [ $# -gt 1 ]; then
                        shift
                    fi
                    ;;
            * )
                    ;;
        esac
        shift
    done

    return 0
}

#---------------------------------------------------------------------------
# Check execution
#    Return "1" if NetWorker has been started as cluster server on
#    this node, "0" otherwise.
#
#    $1 is the logical host name; it is stored in the global MYLH.
#
#    The process table is only inspected when CLU_TYPE is SC30, or when
#    CLU_TYPE is SC22 and the directory /<logical host> exists.  In every
#    other case (including an empty CLU_TYPE) the answer is "0".
#
#    Overwrites the globals MYLH, answer, process_list, pattern and pid,
#    plus whatever find_processes sets.
#---------------------------------------------------------------------------
NW_CLUSTER_CHECK()
{
    MYLH=$1
    answer=0

    # Expansions are quoted so an empty CLU_TYPE or host name cannot
    # turn the test into a syntax error.
    case "${CLU_TYPE}" in
        SC30 )
            ;;
        SC22 )
            if [ ! -d "/${MYLH}" ]; then
                return ${answer}
            fi
            ;;
        * )
            return ${answer}
            ;;
    esac

    # Need to check to see that we haven't already started nsrd
    # in cluster mode already; do that by looking at the arguments
    # to possible nsrd processes out there.
    process_list=nsrd
    find_processes

    if [ -z "${active_processes}" ]; then
        return ${answer}
    fi

    # An nsrd started for this logical host carries "-k <logical host>"
    # on its command line; the first ps column of such a line is kept
    # as the pid.
    pattern="nsrd -k ${MYLH}"
    pid="`echo "${active_processes}" | egrep "${pattern}" |
            awk '{ print $1 }'`"

    if [ -n "${pid}" ]; then
        answer=1
    fi

    return ${answer}
}


#
#---------------------------------------------------------------------
# Probe:
#    Execute the "Looks Alive" and "Is Alive" actions to determine
#    that the NetWorker server is acting properly.  If not, return
#    non-zero value from this routine.
#
#    By the Sun Cluster mechanisms, this is the bulk of the probe
#    method.  It should spin in a loop indefinitely until the script
#    is shut down by the FM_STOP method.
#
#    $1 is the logical host name (stored in the global MYLH).
#
#    "Looks Alive": NW_CLUSTER_CHECK must report a running cluster
#    nsrd.  While it does not, networker.start is invoked and the check
#    is repeated after 10 seconds, at most 10 times.
#
#    "Is Alive": `nsrlic -s <logical host>` is run under hatimerun with
#    PROBE_TIMEOUT_SECS and must exit 0; it is retried after 10 seconds,
#    at most 10 times.
#
#    After both actions succeed the routine sleeps PROBE_INTERVAL_SECS
#    and starts over.
#
#    Returns "1" if the "Looks Alive" action fails; "2" if the
#    "Is Alive" action fails; otherwise, should not return.
#
#    Reads the globals CLU_TYPE, MTHDDIR, HATIMERUN, LGBIN,
#    PROBE_TIMEOUT_SECS, PROBE_INTERVAL_SECS, RESOURCE_NAME,
#    RESOURCEGROUP_NAME and RESOURCETYPE_NAME; overwrites MYLH, retries,
#    process_list, count, HLIMIT and SLIMIT.
#---------------------------------------------------------------------
#
NW_CLUSTER_PROBE()
{
    MYLH=$1
    retries=10

    process_list=nsrd
    while true ; do

        # The "Looks Alive" action...
        count=0
        while true ; do
            if [ ${count} -ge ${retries} ]; then
                return 1
            fi

            NW_CLUSTER_CHECK ${MYLH}
            if [ $? -ne 0 ]; then
                # Check worked.  Pop out of this loop
                break
            fi

            # The NetWorker agent is not running on this node.
            # Attempt to start it up.

            # pmfadm in SC 3.0 sets the max file descriptor very high
            # (65536) which will cause a Segmentation Fault in nsrexecd.
            # Reset it to something lower (1024 or less)...
            #
            # The hard limit may be reported as a word such as
            # "unlimited" (or not at all); that must be clamped to 1024
            # too rather than being fed to a numeric comparison.
            HLIMIT=`ulimit -H -n`
            case "${HLIMIT}" in
                '' | *[!0-9]* )
                    SLIMIT=1024
                    ;;
                * )
                    if [ "${HLIMIT}" -gt 1024 ]; then
                        SLIMIT=1024
                    else
                        SLIMIT=${HLIMIT}
                    fi
                    ;;
            esac
            ulimit -S -n ${SLIMIT}

            if [ "X${CLU_TYPE}" = "XSC30" ]; then
                scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s DEGRADED -m "daemons missing"
            fi
            # The resource arguments stay unquoted on purpose: quoting
            # would hand empty words to networker.start when they are
            # not set.
            ${MTHDDIR}/networker.start "${MYLH}" -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -T ${RESOURCETYPE_NAME}

            sleep 10
            count=`expr ${count} + 1`
        done


        # Now, the "Is Alive" action...
        count=0
        while true ; do
            if [ ${count} -ge ${retries} ]; then
                return 2
            fi

            ${HATIMERUN} -t ${PROBE_TIMEOUT_SECS} ${LGBIN}/nsrlic -s ${MYLH} > /dev/null 2>&1
            if [ $? -eq 0 ]; then
                # The probe worked.  Pop out of this loop
                break
            fi

            sleep 10
            count=`expr ${count} + 1`
        done

        if [ "X${CLU_TYPE}" = "XSC30" ]; then
            scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s OK -m "online, monitored"
        fi
        sleep ${PROBE_INTERVAL_SECS}
    done

}


#
#---------------------------------------------------------------------
# Main Program:
#---------------------------------------------------------------------
#

# Only SunOS is supported; bail out with a failure status anywhere else.
uname=`uname`
case "X${uname}" in
    XSunOS )
        ;;
    * )
        exit 1
        ;;
esac

ARCH=SOLARIS

# Sort out which cluster version is installed.  The later check wins
# when both packages are present.
if pkginfo -q SUNWscmgr; then           # Sun Cluster 2.2
    CLU_TYPE=SC22
fi
if pkginfo -q SUNWscr; then             # Sun Cluster 3.0
    CLU_TYPE=SC30
fi

# Sort out which backup product is installed; this decides the shutdown
# command and the binary directory.  Again the later check wins.
if pkginfo -q LGTOclnt; then            # EMC NetWorker
    NSR_STOP="/usr/sbin/nsr_shutdown -q"
    PRODUCT="NetWorker"
    LGBIN=/usr/sbin
fi
if pkginfo -q SUNWebsc; then            # Sun StorEdge(TM) Enterprise Backup
    NSR_STOP="/usr/sbin/nsr/nsr_shutdown -q"
    PRODUCT="Sun StorEdge(TM) Enterprise Backup"
    LGBIN=/usr/sbin/nsr
fi


# NOTE: some systems put "Berkeley" utilities in /usr/ucb, others (e.g. SGI)
# put them in /usr/bsd.  Also, some systems use /usr/etc and other use
# /usr/sbin.  We include all variants in addition to the path to this
# program to be safe.
#
# mypath is the directory part of $0: "/" when the script sits directly
# under the root directory, "." when $0 carries no directory at all.
mypath="`expr X\"${0}\" : X'\(.*\)/.*' \| X\"${0}\" : X'\(/\)[^/]*$' \| '.'`"
PATH=/usr/ucb:/usr/bsd:/bin:/usr/bin:/etc:/usr/etc:/usr/sbin:$mypath:$PATH

# Both tests use the X-prefix form so that an unset ARCH or CLU_TYPE
# compares as "no match" instead of breaking the test.
if [ "X${ARCH}" = "XDIGITAL" ]; then
    PATH=$PATH:/usr/opt/networker/bin
elif [ "X${CLU_TYPE}" = "XSC30" ]; then
    PATH=$PATH:/usr/cluster/bin
fi
export PATH

# Remember how we were invoked; the probe registration re-runs this path.
ARGV0=$0

# Cluster specific tool locations and the logical host to work on.
# CLU_TYPE is quoted so that an unset value simply selects neither arm.
case "${CLU_TYPE}" in
    SC22 )
        SCBIN="/opt/SUNWcluster/bin"
        HACTL=${SCBIN}/hactl
        HAGET=${SCBIN}/haget
        HATIMERUN=${SCBIN}/hatimerun
        CLUSTER_KEY=`${HACTL} -f cluster_key`
        SYSLOG_FACILITY=`${HAGET} -f syslog_facility`
        # The logical host is worked out later from the arguments.
        MYLH=
        MTHDDIR=${LGBIN}
        ;;
    SC30 )
        # Picks up RESOURCE_NAME, RESOURCEGROUP_NAME and
        # RESOURCETYPE_NAME.  ${1+"$@"} passes every argument through
        # unchanged and expands to nothing when there are none.
        parse_args ${1+"$@"}
        SCBIN=/usr/cluster/bin
        HATIMERUN=${SCBIN}/hatimerun
        SYSLOG_FACILITY=`scha_cluster_get -O SYSLOG_FACILITY`
        # Prefer the Servername extension property (its first output
        # line is dropped by tail); fall back to the network resources
        # the resource uses when that property yields nothing.
        MYLH=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O EXTENSION Servername | tail +2`
        if [ -z "${MYLH}" ] ; then
            MYLH=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O NETWORK_RESOURCES_USED`
        fi
        NSR_SHARED_DISK_DIR=`scha_resource_get -G ${RESOURCEGROUP_NAME} -R ${RESOURCE_NAME} -O EXTENSION Config_dir | tail +2`
        MTHDDIR=/usr/lib/nsr
        ;;
esac

# Tag under which the probe is registered with pmfadm and which prefixes
# the syslog messages.
FMPROBE=nsrdfm


# Decide whether this invocation is the probe itself (REGISTER=false) or
# the FM_START registration (REGISTER=true).  The X-prefix comparisons
# keep the tests well formed when $1 or CLU_TYPE is empty.
if [ "X$1" = "X-probe" ]; then
    # Called as a periodic probe.  Check for the proper functioning
    # of the second parameter and then scram.
    LH=$2
    PROBE_TIMEOUT_SECS=60
    PROBE_INTERVAL_SECS=60

    if [ "X${CLU_TYPE}" = "XSC22" ]; then
        # Only accept the logical host when its administrative file
        # system carries an nsr directory.
        PATHPREFIX=`${HAGET} -f pathprefix -h ${LH}`
        CONFIG_DIR="${PATHPREFIX}/nsr"
        if [ -d "${CONFIG_DIR}" ]; then
            MYLH=${LH}
        fi
    fi
    REGISTER="false"
elif [ "X${CLU_TYPE}" = "XSC22" ]; then
    # Doing FM_START registration.  Perform the functions to register
    # this script as NetWorker agent probe.
    REGISTER=true

    # Replace comma with space to form an sh word list:
    MASTERED_LOGICAL_HOSTS="`echo "$1" | tr ',' ' '`"

    #
    # Dynamically search the list of logical hosts which this physical
    # host currently masters, to see if one of them is the logical host
    # that NetWorker uses
    #
    for LH in ${MASTERED_LOGICAL_HOSTS} ; do
        # Map logical hostname to the administrative file system name:
        PATHPREFIX=`${HAGET} -f pathprefix -h ${LH}`
        CONFIG_DIR="${PATHPREFIX}/nsr"
        if [ -d "${CONFIG_DIR}" ]; then
            MYLH=${LH}
            break
        fi
    done
elif [ "X${CLU_TYPE}" = "XSC30" ]; then
    REGISTER=true
    # ${1+"$@"} hands the arguments over unchanged and expands to
    # nothing when there are none.
    parse_args ${1+"$@"}
fi

if [ -z "${MYLH}" ]; then
    # This host does not currently master the logical host.
    # Nothing to do.
    exit 0
elif [ "${REGISTER}" = "true" ]; then
    # Register this script as the probe function with the correct
    # arguments to distinguish it as such ($1 = "-probe").
    # The arguments stay unquoted on purpose: quoting would pass empty
    # words for resource names that are not set.
    pmfadm -c ${FMPROBE} -n 1 -t 1 ${ARGV0} -probe ${MYLH} -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -T ${RESOURCETYPE_NAME}
else
    probes=0
    while true ; do
        # When this routine finishes, it means that we've encountered
        # a systematic problem that requires us to request the SC soft-
        # ware to handle via failover.
        NW_CLUSTER_PROBE ${MYLH}

        # Save the status right away: every later command, including a
        # "[" test, overwrites $?.
        probe_status=$?

        case ${probe_status} in
            1 )
                # This is bad.  Can't even get the "Looks Alive" check
                # working.
                logger -p ${SYSLOG_FACILITY}.err \
                    "${FMPROBE}:  Cleaning up non-existent NetWorker service"
                ;;
            2 )
                # Something else is wrong.  Cannot seem to communicate to
                # the nsrd daemon.
                logger -p ${SYSLOG_FACILITY}.err \
                    "${FMPROBE}:  Cannot communicate with NetWorker service"
                ;;
        esac

        if [ "X${CLU_TYPE}" = "XSC30" ]; then
            scha_resource_setstatus -R ${RESOURCE_NAME} -G ${RESOURCEGROUP_NAME} -s FAULTED -m "Server unreachable"
        fi

        # NSR_STOP holds a command plus its option, so it must be left
        # unquoted to be split into words.
        ${NSR_STOP}

        probes=`expr ${probes} + 1`
        logger -p ${SYSLOG_FACILITY}.err \
            "${FMPROBE}:${probes}::  giving up mastery of ${MYLH}"

        # Ask the cluster framework to move the service elsewhere.
        if [ "X${CLU_TYPE}" = "XSC22" ]; then
            ${HACTL} -g -s networker -k ${CLUSTER_KEY} -l ${MYLH}
        else
            scha_control -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME -O GIVEOVER
        fi
    done
fi


#
#  Any non zero exit will be considered a failure.
#
exit 0