#!/bin/ksh93

# Read-only script constants:
#   VERSION .. version shown in the usage text
#   FPROG .... path of this script file as reported by the shell
#   PROG ..... FPROG without its directory part
#   SDIR ..... FPROG without its last path component
typeset -r VERSION='1.0'
typeset -r FPROG=${.sh.file}
typeset -r PROG=${FPROG##*/}
typeset -r SDIR=${FPROG%/*}

# Let getopts render the help text described by USAGE.
#
# $1 .. if non-empty, getopts gets called with '-?', otherwise with '--man'
#       (see the getopts section of the ksh93 manual for what each one shows).
function showUsage {
	# Keep X local: the global X holds the option specification which the
	# `while getopts "${X}"` loop of the script re-expands on every iteration.
	typeset X

	if [[ -n $1 ]]; then
		X='-?'
	else
		X='--man'
	fi
	# USAGE is deliberately expanded unquoted and passed through print, so
	# that getopts receives it as a single line of space separated words.
	getopts -a ${PROG} "${ print ${USAGE} ; }" OPT $X
}

# Read-only lookup table (associative, integer values): maps a connection
# type label as found in the cells of the `nvidia-smi topo -m` matrix to the
# number getLinks() emits for it.
# see https://docs.nvidia.com/datacenter/nvtags/1.1/nvtags-user-guide/index.html
typeset -Air LINK=(
	[SYS]=10
	[NODE]=19
	[PHB]=18
	[PXB]=20
	[PIX]=20
	[NV1]=25
	[NV2]=25
	[NV3]=25
	[NV4]=25
	[NV5]=25
	[NV6]=25
	[NV12]=25
	[NET]=8
	[X]=-1
)

# Translate the cells 1..N of one topology matrix row into a list of numbers.
#
# $1 .. name of the variable which receives the result. Every number gets
#       prefixed with a comma, so a non-empty result starts with ','.
# $2 .. number of cells to translate (the array elements 1..N get used).
# $3 .. name of the array which holds the row (element 0 is not looked at).
#
# A cell found in LINK yields the value stored there, any other cell starting
# with 'NV' yields 25 and everything else yields 0.
function getLinks {
	# NOTE(review): keep the name references first - they are presumably
	# bound to the caller's variables before any local of this function
	# exists; verify before reordering these declarations.
	typeset -n LINKS=$1 V=$3
	integer N=$2 I		# I used to leak into the global scope
	typeset W

	LINKS=
	for (( I=1; I <= N; I++ )); do
		W=${LINK[${V[I]}]}
		if [[ -n $W ]]; then
			LINKS+=",$W"
		elif [[ ${V[I]:0:2} == 'NV' ]]; then
			# NVLink label not listed in LINK
			LINKS+=',25'
		else
			LINKS+=',0'
		fi
	done
}

# Print a gres.conf snippet (one line per GPU, only if nvidia-smi is found)
# followed by a slurm.conf NodeName line for the local machine to stdout.
#
# Input gets taken from /proc/meminfo, `nvidia-smi -L`, `nvidia-smi topo -m`
# and `lscpu`. The node name is taken from the variable HOST; if it is unset
# or empty, the short form of `uname -n` gets used instead.
# Arguments are not used.
function doMain {
	integer N I SOCKETS=0 THREADS=0 CORES=0 MEM=0 OFFSET=0 M
	typeset -a A AM TYPE CPUS
	typeset S T LAST_GPU GPU CLIST LAST_CPU NODE GRESLIST= GRESOPT=
	typeset -l LC		# lower-cases everything assigned to it
	typeset -r NVSMI=${ whence nvidia-smi; }
	typeset -Ai GRES	# GPU type alias -> number of GPUs of this type

	# This script never sets HOST itself, so do not emit an empty NodeName
	# if the environment does not provide it.
	NODE=${HOST}
	if [[ -z ${NODE} ]]; then
		NODE=${ uname -n; }
		NODE=${NODE%%.*}
	fi

	# Only the first line gets read - assumes it is 'MemTotal: <n> kB'.
	read -A A </proc/meminfo
	(( MEM = ${A[1]}/1024 - 4 * 1024 )) 	# OS - 4 GB
	if [[ -z ${NVSMI} ]]; then
		print -u2 '# nvidia-smi not found - assume no GPU node.'
	else
		# map GPU# to an alias for slurm GPU type (could be an arbitrary name)
		nvidia-smi -L | while read -A A ; do
			[[ $A == 'GPU' ]] || continue
			N=${A[1]%:}
			T=
			# Join the words between 'GPU <n>:' and '(UUID:' with dots, but
			# skip the vendor/brand words.
			for (( I=2; I < ${#A[@]} - 2; I++ )); do
				S="${A[I]}"
				[[ $S == '(UUID:' ]] && break
				[[ $S == 'NVIDIA' || $S == 'GeForce' || $S == 'Tesla' ]] && \
					continue
				T+=".$S"
			done
			TYPE[N]="${T//-/.}"		# has a leading '.', stripped on use
		done
		# Get the CPU Affinity for each GPU and print a corresponding GPU
		# resource line suitable for /etc/slurm/gres.conf.
		print '# gres.conf\n#AutoDetect=nvml\nAutoDetect=Off'
		# explicit flag: If srun gets executed w/o the option --gpus=N, the
		# env vars CUDA_VISIBLE_DEVICES and GPU_DEVICE_ORDINAL do not get set
		# at all, otherwise they get set to all available GPUs, i.e. usually
		# to 0,1,2,3,4,5,6,7.
		M=0
		LAST_GPU=
		nvidia-smi topo -m | while read -A A ; do
			if [[ ${A:0:3} != 'GPU' ]]; then
				# NOTE(review): the header line ends up here because its
				# first field does not start with 'GPU' - presumably because
				# nvidia-smi prefixes it with an escape sequence; verify.
				# The next such line after the header ends the matrix.
				(( M )) && break
				# M = index of the first header column not naming a GPU
				for (( I=1; I < ${#A[@]}; I++ )); do
					[[ ${A[I]:0:3} == 'GPU' ]] && continue
					M=$I
					break
				done
				# nvidia-smi topo -m is version dependent - find out which
				# column has CPU. The data rows have the GPU name as
				# additional leading field, hence the + 1.
				for ((I=M ; I < ${#A[@]}; I++ )); do
					[[ ${A[I]} == 'CPU' ]] && (( OFFSET = I + 1 )) && break
				done
				continue
			fi
			getLinks T $M A
			# ${A:3} is the GPU number, i.e. 'GPU<n>' without the 'GPU'.
			if [[ ${LAST_GPU} != "${TYPE[${A:3}]:1}" ]]; then
				LAST_GPU="${TYPE[${A:3}]:1}"
				print "# GPU: ${LAST_GPU}"
				AM=( ${LAST_GPU//./ } );
				if [[ ${AM} == 'RTX' ]]; then
					GPU="rtx.${AM[1]}"	# ignores Ti etc. trailer
				elif [[ ${AM} =~ ^[A-Z][0-9]+$ ]]; then
					LC=${AM:0:1}
					GPU="${LC}.${AM:1}"	# ignore SXM2.32GB etc. trailer
				else
					# first two words of the name, lower-cased
					LC=${AM}
					GPU="${LC}."
					LC=${AM[1]}
					GPU+="${LC}"
				fi
			fi
			(( GRES[${GPU}]++ ))
			CPUS=( ${A[OFFSET]//,/ } )
			CLIST="${CPUS[0]}"
			if (( ${#CPUS[@]} > 1 )) && [[ ${LAST_CPU} != "${A[OFFSET]}" ]]; then
				# slurm now wants Cores, not CPUs (alias strands) anymore.
				# So assume x86 and ignore the second range, which usually
				# refers to strand#2 of the related cores. The physical ID of
				# strand#1 denotes the logical ID of the related core, which is
				# used by slurm.
				LAST_CPU="${A[OFFSET]}"
				print "# CPU: ${LAST_CPU}"
			fi
			print "NodeName=${NODE} Name=gpu Type=${GPU}" \
				"File=/dev/nvidia${A:3} Links=${T:1} Cores=${CLIST}" \
				"Flags=nvidia_gpu_env,opencl_env,explicit"
		done
	fi
	# The labels matched below are the English ones, so pin the locale.
	LC_ALL=C lscpu | while read -A A ; do
		[[ $A == 'Socket(s):' ]] && SOCKETS=${A[1]} && continue
		[[ $A == 'Thread(s)' ]] && THREADS=${A[3]} && continue
		[[ $A == 'Core(s)' ]] && CORES=${A[3]} && continue
	done
	# To avoid "A line in gres.conf for GRES gpu:$TYPE has $N more configured
	# than expected in slurm.conf. Ignoring extra GRES." the type in gres must
	# appear as Gres=gpu:${TYPE}:M too! gpu:${TYPE}:n may appear more than once
	# separated by a comma.
	for S in "${!GRES[@]}" ; do
		GRESLIST+=",$S:${GRES[$S]}"
	done
	# No GPU found: omit the Gres option instead of emitting a bare 'Gres=gpu:'.
	if [[ -n ${GRESLIST} ]]; then
		GRESOPT=" Gres=gpu:${GRESLIST:1}"
	fi
	print "\n# slurm.conf" \
		"\nNodeName=${NODE}${GRESOPT} Sockets=${SOCKETS}" \
		"CoresPerSocket=${CORES} ThreadsPerCore=${THREADS} RealMemory=${MEM}" \
		"State=UNKNOWN"

	#srun -N 1 --gres=gpu:4 env | egrep 'CUDA|GPU'
	#srun --gpus=1 --mem=10G --pty bash
}

# Option and manual specification for getopts. It is used by the option loop
# of this script as well as by showUsage(), and in both places it gets passed
# unquoted through `print` first, so getopts receives it as one line of space
# separated words. VERSION and PROG get expanded right here, everything else
# is literal text.
USAGE="[-?${VERSION}"' ]
[-copyright?Copyright (c) 2023 Jens Elkner. All rights reserved.]
[-license?CDDL 1.0]
[+NAME?'"${PROG}"' - script to generate a slurm gres.conf]
[+DESCRIPTION?This simple script generates a gres.conf as well as slurm.conf NodeName snippet and prints it to the standard output for the zone the script is running on. For GPU detection nvidia-smi should be in your \bPATH\b (for now Nvidia GPUs are supported by this script, only).]
[h:help?Print this help and exit.]
[F:functions?Print a list of all functions available.]
[T:trace]:[functionList?A comma separated list of functions of this script to trace (convinience for troubleshooting).] 
'

# Parse the command line options described by USAGE, then run doMain with
# the remaining arguments.
X="${ print ${USAGE} ; }"
while getopts "${X}" OPT ; do
	case ${OPT} in
		h)
			showUsage
			exit 0
			;;
		T)
			# Switch on tracing for the named functions; 'ALL' selects
			# every function `typeset +f` lists.
			if [[ ${OPTARG} != 'ALL' ]]; then
				typeset -ft ${OPTARG//,/ }
			else
				typeset -ft ${ typeset +f ; }
			fi
			;;
		F)
			typeset +f && exit 0
			;;
		*)
			showUsage 1
			exit 1
			;;
	esac
done

# Drop the options already handled, so that only the operands remain.
X=$(( OPTIND - 1 ))
shift $X && OPTIND=1
unset X

doMain "$@"

# vim: ts=4 sw=4 filetype=sh
