#!/bin/ksh
##############################################################
# $Id: nw_ux.lc.sh,v 6.18 2006/04/24 15:55:17 castrv Exp $ Copyright (c) 2004-2006 EMC Corporation.
#
# Copyright (c) 2004-2006 EMC Corporation.
#
# All rights reserved.  This is an UNPUBLISHED work, and
# comprises proprietary and confidential information of EMC.
# Unauthorized use, disclosure, and distribution are strictly
# prohibited.  Use, duplication, or disclosure of the software
# and documentation by the U.S. Government is subject to
# restrictions set forth in a license agreement between the
# Government and EMC or other written agreement specifying
# the Government's rights to use the software and any applicable
# FAR provisions, such as FAR 52.227-19.
#
##############################################################
# Note:
# - search for "# Customise" below and adjust accordingly. Mandatory updating
#   is clearly indicated as "mandatory"
##############################################################
#
# Customise - optional
#set -x  # for detailed script tracing

# Send everything this script prints (stdout and stderr) to a log file
# under /tmp; the requested action ($1), when given, becomes the suffix.
if [ "$1" != "" ]; then
  exec >"/tmp/nw_ux.lc.log.$1" 2>&1
else
  exec >/tmp/nw_ux.lc.log 2>&1
fi
date
echo "$*"

##############################################################
#Wrapper templates for start, stop, and monitor logic
#Place the nsr-specific calls within

# startxxx - record our PID in $PIDFILE and run nsr_start.
#   Globals read:    MODE, PIDFILE
#   Globals written: rc (set by nsr_start)
#   MODE=all_in_one: returns 0 on success so the caller can enter monxxx;
#                    on failure hands over to stopxxx, which exits.
#   MODE=separate:   exits with rc (LC mgr is assumed to call "mon" next).
#   Any other MODE:  exits 2.
startxxx() {
  echo "startxxx: ENTRY"
  date
  echo $$ >"$PIDFILE"
  rc=0

  #Start code ...
  #Could put check to ensure another instance not already running but
  #LC mgr should already ensure this.
  nsr_start   #always returns after setting and returning rc code ($?)

  case "$MODE" in
    all_in_one)
      if [ "$rc" != "0" ]; then
        stopxxx   #error: stopxxx will not return
      fi
      return $rc  #to enter monxxx state
      ;;
    separate)
      exit $rc    #assumes LC mgr will next call monxxx independently
      ;;
    *)
      echo "Bug!?!"   #should never happen
      exit 2
      ;;
  esac
}

# monxxx - start nsrd via nsr_nsrd_start, then probe with nsr_mon forever.
#   Globals read:    MODE, MON_SLEEP_SECS
#   Globals written: rc (set by nsr_nsrd_start / nsr_mon)
#   Never returns: any non-zero rc in a known MODE ends in stopxxx (which
#   exits); an unknown MODE with a failed probe exits 2.
monxxx() {
  echo "monxxx: ENTRY"
  date
  rc=0

  nsr_nsrd_start
  if [ "$rc" != "0" ]; then
    case "$MODE" in
      all_in_one | separate)
        #got error, so end by calling stop logic
        #pre-stop mode specifics here
        stopxxx
        ;;
    esac
  fi

  while :; do
    #do existence/response check; on error call cleanup
    date
    nsr_mon   #one-time check and sets & returns with rc code
    if [ "$rc" = "0" ]; then
      #forever check until error
      # NOTE(review): POSIX shells defer a trapped signal until the
      # foreground command finishes, so a TERM may only be acted on once
      # this sleep returns - confirm MON_SLEEP_SECS is acceptable for that.
      sleep $MON_SLEEP_SECS
    else
      case "$MODE" in
        all_in_one | separate)
          #got error, so end by calling stop logic
          #pre-stop mode specifics here
          stopxxx
          ;;
        *)
          echo "Bug!?!"   #should never happen
          exit 2
          ;;
      esac
    fi
  done
}

# stopxxx - stop NetWorker, kill any sibling "mon" instance, clean up, exit.
#   Globals read:    rc (failure code of whatever led here), NWARCH
#   Globals written: rc
#   Exit status: the rc held on entry when it was non-zero (nsr_stop resets
#   rc to 0, which would otherwise hide the failure that triggered the
#   stop); otherwise the rc produced by nsr_stop.
stopxxx() {
  echo "stopxxx: ENTRY"
  date
  stop_entry_rc=$rc

  #any stop specific stuff here: shutdown, kill children, ...
  nsr_stop
  if [ "${stop_entry_rc:-0}" != "0" ]; then
    rc=$stop_entry_rc
  fi

  #Most useful: kill sibling instance in monitor mode if still exists:
  #this should already have been done by the LC mgr.
  if [ "${NWARCH}" = "LINUX" ]; then
    stop_ps=ps
  else
    stop_ps=/usr/bin/ps
  fi
  # Skip our own PID and our own children: when the monitor itself calls
  # stopxxx it matches "nw_ux.lc mon" too, and killing it here would
  # prevent cleanup and the exit below from ever running.
  stop_pids=$($stop_ps -ef | grep "nw_ux.lc mon" | grep -v grep |
    awk '$2 != '"$$"' && $3 != '"$$"' { print $2 }')
  if [ -n "$stop_pids" ]; then
    kill -KILL $stop_pids 2>/dev/null
  fi

  cleanup
  exit $rc
}

#aka general cleanup-shutdown stuff
# cleanup - remove the PID file written by startxxx.
cleanup() {
  echo "cleanup: ENTRY"
  date
  #other general stuff
  rm -f "$PIDFILE"
}

#Could also have stop action for all_in_one mode, touch flag file which
#monxxx regularly checks for.
# sigtermhndlr - signal handler installed by the start_mon action; hands
# over to stopxxx (which exits the script).
sigtermhndlr() {
  echo "sigtermhndlr: ENTRY"
  date
  stopxxx
}

##############################################################
#NW Specific function code
#Inherited from previous NW cluster projects - not necessarily the most ideal

#
# INPUTS:
#       process_list=   The list of processes names to find
# OUTPUTS:
#       active_processes=       The ps output of the processes we targeted.
find_processes() {
  # Nothing to look for: report nothing instead of handing awk the
  # unbalanced pattern ")" that the loop below would otherwise produce.
  if [ -z "${process_list}" ]; then
    active_processes=
    return
  fi

  case "${NWARCH}" in
    SOLARIS) ps_output="$(ps -ax | egrep 'nsr|save|nwadmin')" ;;
    DIGITAL) ps_output="$(ps -eopid,tt,time,comm | egrep 'nsr|save|nwadmin')" ;;
    *)       ps_output="$(ps -e | egrep 'nsr|save|nwadmin')" ;;
  esac

  # build up an egrep style pattern we can feed to awk which will match
  # all the various daemons.
  pattern=
  for daemon in $process_list; do
    if [ -z "${pattern}" ]; then
      pattern='[0-9] [-a-zA-Z\+\/._0-9]*('"${daemon}"
    else
      pattern="${pattern}|${daemon}"
    fi
  done
  pattern="${pattern}"')'

  if [ -n "${ps_output}" ]; then
    active_processes="$(echo "${ps_output}" |
      awk '/awk/ { next } $0 ~ /'"$pattern"'/ { print $0 }')"
  else
    active_processes=
  fi
}

# Set up NetWorker to run as a client
# Points ${NSR_LINK} at ${NSR_LOCAL} and restarts nsrexecd.
# Sets rc=1 (and returns it) when nsrexecd is not seen running afterwards.
nsr_local() {
  echo "nsr_local: ENTRY"
  date

  # stop nsr services
  # (NSR_STOP holds "command options", so it must stay unquoted)
  ${NSR_BIN}/${NSR_STOP}

  # redefine the sym links
  rm -f ${NSR_LINK}
  ln -s ${NSR_LOCAL} ${NSR_LINK}

  # start nsrexecd
  ${NSR_BIN}/nsrexecd
  sleep 5

  process_list="nsrexecd"
  find_processes
  if [ -z "${active_processes}" ]; then
    nsr_log "Unable to start nsrexecd"
    rc=1
    return $rc
  fi
}

# nsr_log - append "<date>: <hostname> <message>" to ${NSR_BIN}/${LOGFILE}.
#   $1 - message text
nsr_log() {
  echo "nsr_log: ENTRY"
  date
  touch ${NSR_BIN}/${LOGFILE}
  chmod 666 ${NSR_BIN}/${LOGFILE}
  TIME=`date`

  # write the message
  echo "$TIME: $HOSTNAME $1" >> ${NSR_BIN}/${LOGFILE}
}

# nsr_start - stop any running NetWorker daemons, point ${NSR_LINK} at the
# shared-disk nsr directory and start nsrexecd.
#   Sets and returns rc: 0 on success, 1 when the daemons could not be
#   stopped, the shared directory is missing, or its nsr subdirectory
#   cannot be created.
nsr_start() {
  echo "nsr_start: ENTRY"
  date
  nsr_log "Start action script for service networker"
  rc=0

  # stop nsr services
  ${NSR_BIN}/${NSR_STOP}
  sleep 5

  # check to see if nsr services are still running
  process_list="${NSR_DAEMONS}"
  find_processes
  if [ -n "${active_processes}" ]; then
    nsr_log "Unable to kill nsrd; could not stop Networker"
    rc=1
    return $rc
  fi

  # make the shared database directory
  if [ ! -d "${NSR_SHARED_DISK_DIR}" ]; then
    nsr_log "Make the shared nsr mount point directory first:"
    nsr_log "${NSR_SHARED_DISK_DIR}"
    rc=1
    return $rc
  fi
  # Use exactly the path the symlink below points at (no extra leading
  # slash), and do not carry on if the directory cannot be created.
  if [ ! -d "${NSR_SHARED_DISK_DIR}/nsr" ]; then
    mkdir "${NSR_SHARED_DISK_DIR}/nsr" || {
      nsr_log "Unable to create ${NSR_SHARED_DISK_DIR}/nsr"
      rc=1
      return $rc
    }
  fi

  # redefine the sym links
  rm -f ${NSR_LINK}
  ln -s "${NSR_SHARED_DISK_DIR}/nsr" ${NSR_LINK}

  # start nsrexecd
  ${NSR_BIN}/nsrexecd

  return $rc
}

# nsr_nsrd_start - wait for the cluster manager to report the "networker"
# resource group online (when ftcli is available), then start nsrd.
#   Globals read: FT_DOMAIN, FT_DIR, LC_FIVE, NSR_BIN, NSR_SERVICE_ID
#   Sets and returns rc: 0 when nsrd is seen running, 1 otherwise.
nsr_nsrd_start() {
  echo "nsr_nsrd_start: ENTRY"
  date
  nsr_log "Starting nsrd"
  rc=0

  #May want to query LC mgr to block here until NW is seen as registered.
  #This will prevent race conditions as happened elsewhere.
  sleep 15
  if [ "${FT_DOMAIN}" != "" ] && [ -x "${FT_DIR}/bin/ftcli" ]; then
    retries=5
    count=0
    # Major version number: the word following "Version" in "ftcli -v"
    # output, cut at the first dot.
    LC_VER="`${FT_DIR}/bin/ftcli -v | awk '
      /Version/ { for (i = 1; i <= NF; i++) if ($i ~ /Version/) print $(i+1) }
      ' | awk -F. '{ print $1 }'`"
    while [ $count -lt $retries ]; do
      # assumes res group name is "networker"
      if [ "${LC_VER}" -ge "${LC_FIVE}" ]; then
        ${FT_DIR}/bin/ftcli -c "getRule networker" |
          awk ' /ft_CurrentState/ { print $2 } ' | grep -i online
      else
        ${FT_DIR}/bin/ftcli -c "getRule networker" |
          awk ' /Resource Group State/ { print $4 } ' | grep -i online
      fi
      # $? is still the status of the grep that ran inside the if above
      if [ $? -eq 0 ]; then
        # networker is registered as running
        nsr_log "networker is registered as online"
        break
      fi
      nsr_log "networker not yet registered as online"
      sleep 5
      count=`expr ${count} + 1`
    done
  fi

  # Customise accordingly for your site for all "rd=" device storage nodes -
  # mandatory if using "rd=phyhost:xxx" devices
  sn_list=""    #hostname-cmd compatible format, e.g. "foo.abc.com bar.abc.com"
  for sn_i in $sn_list; do
    # Local node already recycled earlier on, so skip
    if [ "$sn_i" != "`hostname`" ]; then
      nsr_log "Refreshing remote storage node $sn_i ..."
      #Choose appropriate remote shell cmd:
      #rsh $sn_i "${NSR_BIN}/nsr_shutdown -q && ${NSR_BIN}/nsrexecd"
    fi
  done
  # NOTE(review): the text between the rsh example above and the nsrd start
  # below was lost from the copy of the script this was restored from; the
  # closing fi/done are reconstructed and only this trailing fragment of the
  # missing text survives:
  #     /nsr/logs/nsrd.log.$$ 2>&1 &
  # Compare against the pristine script before populating sn_list.

  ${NSR_BIN}/nsrd -k ${NSR_SERVICE_ID}
  sleep 15

  # check to see if nsrd is running
  process_list="nsrd"
  find_processes
  if [ -z "${active_processes}" ]; then
    nsr_log "NetWorker failed to start."
    # stdout first, then stderr: "2>&1 >/dev/null" would leave stderr
    # pointing at the log instead of discarding it.
    ${FT_DIR}/bin/ftcli -c "listUsers" | grep root |
      grep -i "`uname -n`" >/dev/null 2>&1
    if [ $? -ne 0 ]; then
      nsr_log "Check that a root entry for the current node exists in the Security section under the Domain menu of the AAM management console. If not NW server nsrd won't start."
    fi
    rc=1
  fi

  return $rc
}

# nsr_stop - stop the NetWorker daemons, kill the nwadmin GUI if present,
# and fall back to the local (client) configuration via nsr_local.
#   Sets and returns rc: 0 on success, 1 when daemons are still running
#   (in which case nsr_local is not attempted) or nsr_local fails.
nsr_stop() {
  echo "nsr_stop: ENTRY"
  date
  nsr_log "Stop action script for service networker"
  rc=0

  # stop nsr services
  ${NSR_BIN}/${NSR_STOP}
  sleep 5

  # check to see if nsr services are still running
  process_list="${NSR_DAEMONS}"
  find_processes
  if [ -n "${active_processes}" ]; then
    nsr_log "Unable to kill nsrd; could not stop Networker"
    rc=1
    return $rc
  fi

  # Remove NetWorker GUI if running
  process_list="nwadmin"
  find_processes
  if [ -n "${active_processes}" ]; then
    pid=`echo "${active_processes}" | \
      awk '$0 ~ /nwadmin/ { print $1 }'`
    nsr_log "Stopping the NetWorker GUI"
    kill -9 ${pid} >/dev/console 2>&1
  fi

  # start local configuration
  nsr_local

  return $rc
}

# nsr_var_init - set the platform-dependent and site-dependent globals
# (NWARCH, NSR_BIN, PATH, NSR_* settings, HOSTNAME, LOGFILE, ...) used by
# the other functions.
nsr_var_init() {
  echo "nsr_var_init: ENTRY"
  date
  #XXX Cater later for Solaris vs HPUX vs AIX in future - uname
  rc=0

  OS=`uname -s`
  case "$OS" in
    SunOS)
      NWARCH=SOLARIS
      NSR_BIN=/usr/sbin
      ;;
    HP-UX)
      NWARCH=HPUX
      NSR_BIN=/opt/networker/bin
      ;;
    Linux)
      NWARCH=LINUX
      NSR_BIN=/usr/sbin
      ;;
    AIX)
      NWARCH=AIX
      NSR_BIN=/usr/bin
      ;;
    *)
      # NWARCH and NSR_BIN stay unset here; make that visible in the trace
      # log instead of failing later with confusing "/nsr_shutdown" errors.
      echo "nsr_var_init: unrecognised OS '$OS' - NWARCH/NSR_BIN not set"
      ;;
  esac

  PATH=/usr/ucb:/usr/bsd:/bin:/usr/bin:/etc:/usr/etc:/usr/sbin:/sbin:/usr/bin:${PATH}:${NSR_BIN}
  export PATH

  NSR_LOCAL=/nsr.NetWorker.local
  NSR_LINK=/nsr         #always a sym link

  # Customise next two - could manually update here ...
  #post-install nw cluster config script networker.cluster updates next two:
  NSR_SHARED_DISK_DIR=user_defined
  NSR_SERVICE_ID=user_defined

  NSR_STOP="nsr_shutdown -q"
  NSR_DAEMONS="nsrd nsrexecd nsrmmdbd nsrindexd"
  HOSTNAME=`uname -n`
  LOGFILE="nw_ux.lc.log"

  # Variable to encapsulate debug nsr_logs to be turned on for debugging.
  # Leave as DEBUG_ON="" if not debugging (ie. Normal operation).
  DEBUG_ON=""

  LC_VER=""
  LC_FIVE="5"
}

# nsr_mon - one "looks alive" probe of the NetWorker server daemons.
#   Polls once a minute until nsrd, nsrindexd and nsrmmdbd are all seen
#   running. Sets and returns rc: 0 when all three are up, 1 when they are
#   still not all up after the retry budget is used.
nsr_mon() {
  echo "nsr_mon: ENTRY"
  date
  if [ -n "${DEBUG_ON}" ]; then
    nsr_log "Probing networker daemons"
  fi
  rc=0

  # Looks alive?
  # nsrd and its immediate children should all be up in less than 20 mins -
  # but could still increase retries and networker.cap values as desired
  retries=20
  count=0
  while true; do
    process_list="nsrd"
    find_processes
    nsrd_active="${active_processes}"

    process_list="nsrindexd"
    find_processes
    nsrindexd_active="${active_processes}"

    process_list="nsrmmdbd"
    find_processes
    nsrmmdbd_active="${active_processes}"

    if [ -n "${nsrd_active}" ] && [ -n "${nsrindexd_active}" ] &&
       [ -n "${nsrmmdbd_active}" ]; then
      break
    fi

    if [ ${count} -ge ${retries} ]; then
      nsr_log "NW Server looks dead"
      rc=1
      return $rc
    fi

    sleep 60
    count=`expr ${count} + 1`
  done

  # dump_perf_sys_info  # Customise - optional

  # The response test is now implemented as a State Monitor in LAAM

  return $rc
}

#--------------------------------------------------------------------
# Optional System Performance Monitoring & Logging
#--------------------------------------------------------------------
# dump_perf_sys_info - print the PID(s) of nsrd between start/end markers;
# the commented-out commands are optional site diagnostics.
dump_perf_sys_info() {
  echo "dump_perf_sys_info start @ `date`:"
  #time rpcinfo -t `hostname` 390103 #nsrd rpc prog# to ping
  # one for each nsrd failover node and the cluster name itself:
  #time nslookup dns_nis_server
  #time ping -c 2 dns_nis_server
  #time nslookup each_clu_node
  #time nslookup xxx
  #time netstat -i
  #time ifconfig -a
  #time df -k
  #time mount
  #time ps -ef | egrep "nsr|save|recover|nw"

  # Needs unstripped nsrd binary and functional/licensed dbx/gdb to
  # get nsrd snapshots:
  NSRD_PID=`ps -ef | egrep ' nsrd|/nsrd' | grep -v grep| egrep -v 'dbx|gdb' | awk '{print $2}'`
  echo NSRD_PID=$NSRD_PID
  # dbx:
  #echo "where\nquit" |
  #time /usr/bin/dbx -pid $NSRD_PID /usr/opt/networker/bin/nsrd
  # gdb:
  #echo "bt\nquit\ny" |
  #time /opt/networker/bin/gdb /opt/networker/bin/nsrd $NSRD_PID
  echo "dump_perf_sys_info end @ `date`:"
}

##############################################################
#main line
echo "main line: ENTRY"
date

# Customise - 1200 secs for a heavy workload site:
MON_SLEEP_SECS=1200

PIDFILE=/tmp/nw_ux.lc.pid  #optional additional safety
MODE=
rc=0 #return code for return/exit primitives: 0-ok 1+-error

nsr_var_init

case "$1" in

  #Mode#1 all_in_one mode
  #In LC:
  #res group start script:
  #res group (dummy wrapper) process start: /.../bin/nw_ux.lc start_mon
  #res group (dummy wrapper) process stop:  /.../bin/nw_ux.lc stop_mon
  #res group stop script:
  start_mon)
    MODE=all_in_one
    #For interrupting monxxx forever loop:
    trap sigtermhndlr TERM QUIT INT HUP
    startxxx
    monxxx
    #not reached
    ;;

  stop_mon)
    MODE=all_in_one
    if [ "${NWARCH}" = "LINUX" ]; then
      main_ps=ps
    else
      main_ps=/usr/bin/ps
    fi
    mon_pids=`$main_ps -ef | grep "nw_ux.lc start_mon" | grep -v grep | awk '{ print $2 }'`
    if [ -n "$mon_pids" ] && kill -TERM $mon_pids 2>/dev/null; then
      :   # running start_mon instance(s) signalled
    elif [ -s "$PIDFILE" ]; then
      kill -TERM `cat "$PIDFILE"`   #fallback: PID recorded by startxxx
    else
      echo "stop_mon: no start_mon instance found and no $PIDFILE"
      exit 1
    fi
    ;;

  #Alternative mode#2 - Actual mode used by nw_ux.lc.imp & nw_ux.lc.aam5.imp template
  #In LC:
  #res group start perl script: system("/.../bin/nw_ux.lc start") && exit (1);
  #res group (dummy wrapper) process: /.../bin/nw_ux.lc mon
  #res group (dummy wrapper) process stop:
  #res group stop perl script: system("/.../bin/nw_ux.lc stop") && exit (1);
  start)
    MODE=separate
    #Fires up actual processes; calls stopxxx on error
    startxxx
    #not reached
    ;;

  #Alternative mode#2
  mon)
    MODE=separate
    #Loops forever; exits $rc on error; exits on receiving SIGTERM et al.
    monxxx
    #not reached
    ;;

  #Alternative mode#2
  stop)
    MODE=separate
    stopxxx
    #not reached
    ;;

  *)
    echo "Usage: nw_ux.lc start_mon|stop_mon | start|mon|stop"
    exit 1
    ;;
esac
#not reached
##############################################################