#!/bin/bash -eExl
#
# Testing script for OpenUCX, to run from Jenkins CI
#
# Copyright (C) Mellanox Technologies Ltd. 2001-2017.  ALL RIGHTS RESERVED.
# Copyright (C) ARM Ltd. 2016-2018.  ALL RIGHTS RESERVED.
#
# See file LICENSE for terms.
#
#
# Environment variables set by Jenkins CI:
#  - WORKSPACE           : path to work dir
#  - BUILD_NUMBER        : jenkins build number
#  - JOB_URL             : jenkins job url
#  - EXECUTOR_NUMBER     : number of executor within the test machine
#  - JENKINS_RUN_TESTS   : whether to run unit tests
#  - RUN_TESTS           : same as JENKINS_RUN_TESTS, but for Azure
#  - JENKINS_TEST_PERF   : whether to validate performance
#  - JENKINS_NO_VALGRIND : set this to disable valgrind tests
#
# Optional environment variables (could be set by job configuration):
#  - nworkers : number of parallel executors
#  - worker   : number of current parallel executor
#  - COV_OPT  : command line options for Coverity static checker
#

WORKSPACE=${WORKSPACE:=$PWD}
ucx_inst=${WORKSPACE}/install
CUDA_MODULE="dev/cuda11.4"
GDRCOPY_MODULE="dev/gdrcopy2.3_cuda11.4"

if [ -z "$BUILD_NUMBER" ]; then
	echo "Running interactive"
	BUILD_NUMBER=1
	WS_URL=file://$WORKSPACE
	JENKINS_RUN_TESTS=yes
	JENKINS_TEST_PERF=1
	TIMEOUT=""
	TIMEOUT_VALGRIND=""
else
	echo "Running under jenkins"
	WS_URL=$JOB_URL/ws
	TIMEOUT="timeout 200m"
	TIMEOUT_VALGRIND="timeout 240m"
fi


#
# Set affinity to 2 cores according to Jenkins executor number.
# Affinity is inherited from agent in Azure CI.
# TODO: remove or rename after CI migration.
#
if [ -n "$EXECUTOR_NUMBER" ] && [ -n "$JENKINS_RUN_TESTS" ]
then
	AFFINITY="taskset -c $(( 2 * EXECUTOR_NUMBER ))","$(( 2 * EXECUTOR_NUMBER + 1))"
else
	AFFINITY=""
fi

#
# Parallel build command runs with 4 tasks, or number of cores on the system,
# whichever is lowest
#
num_cpus=$(lscpu -p | grep -v '^#' | wc -l)
[ -z $num_cpus ] && num_cpus=1
parallel_jobs=4
[ $parallel_jobs -gt $num_cpus ] && parallel_jobs=$num_cpus
num_pinned_threads=$(nproc)
[ $parallel_jobs -gt $num_pinned_threads ] && parallel_jobs=$num_pinned_threads

MAKE="make"
MAKEP="make -j${parallel_jobs}"
export AUTOMAKE_JOBS=$parallel_jobs

have_ptrace=$(capsh --print | grep 'Bounding' | grep ptrace || true)

#
# Set initial port number for client/server applications
#
server_port=$((10000 + (1000 * EXECUTOR_NUMBER)))

#
# Override maven repository path, to cache the downloaded packages accross tests
#
export maven_repo=${WORKSPACE}/.deps

#
# Set up parallel test execution - "worker" and "nworkers" should be set by jenkins
#
if [ -z "$worker" ] || [ -z "$nworkers" ]
then
	worker=0
	nworkers=1
fi
echo "==== Running on $(hostname), worker $worker / $nworkers ===="

# Report an warning message to Azure pipeline
log_warning() {
	msg=$1
	test "x$RUNNING_IN_AZURE" = "xyes" && { azure_log_warning "${msg}" ; set -x; } || echo "${msg}"
}

# Report an error message to Azure pipeline
log_error() {
	msg=$1
	test "x$RUNNING_IN_AZURE" = "xyes" && { azure_log_error "${msg}" ; set -x; } || echo "${msg}"
}

#
# cleanup ucx
#
make_clean() {
	rm -rf ${ucx_inst}
	$MAKEP ${1:-clean}
}

#
# Test if an environment module exists and load it if yes.
# Otherwise, return error code.
#
module_load() {
	set +x
	module=$1
	m_avail="$(module avail $module 2>&1)" || true

	if module avail -t 2>&1 | grep -q "^$module\$"
	then
		module load $module
		set -x
		return 0
	else
		set -x
		return 1
	fi
}

#
# Safe unload for env modules (even if it doesn't exist)
#
module_unload() {
	module=$1
	module unload "${module}" || true
}

#
# try load cuda modules if nvidia driver is installed
#
try_load_cuda_env() {
	num_gpus=0
	have_cuda=no
	have_gdrcopy=no
	if [ -f "/proc/driver/nvidia/version" ]; then
		have_cuda=yes
		have_gdrcopy=yes
		module_load $CUDA_MODULE    || have_cuda=no
		module_load $GDRCOPY_MODULE || have_gdrcopy=no
		num_gpus=$(nvidia-smi -L | wc -l)
	fi
}

unload_cuda_env() {
	module_unload $CUDA_MODULE
	module_unload $GDRCOPY_MODULE
}

#
# Check whether this test should do a task with given index,
# according to the parallel test execution parameters.
#
should_do_task() {
	set +x
	task=$1
	ntasks=$2
	tasks_per_worker=$(( (ntasks + nworkers - 1) / nworkers ))
	my_tasks_begin=$((tasks_per_worker * worker))
	my_tasks_end=$((my_tasks_begin + tasks_per_worker))

	# set return value to 0 (success) iff ($my_tasks_begin <= $task < $my_tasks_end)
	[ $task -ge $my_tasks_begin ] && [ $task -lt $my_tasks_end ]
	rc=$?
	set -x
	return $rc
}

#
# Do a given task only if the current worker is supposed to do it.
#
do_distributed_task() {
	set +x
	task=$1
	ntasks=$2
	shift 2
	if should_do_task $task $ntasks
	then
		echo "==== Running '$@' (task $task/$ntasks) ===="
		set -x
		$@
	else
		echo "==== Skipping '$@' (task $task/$ntasks) ===="
		set -x
	fi
}

#
# Take a list of tasks, and return only the ones this worker should do
#
get_my_tasks() {
	set +x
	task_list=$@
	ntasks=$(echo $task_list|wc -w)
	task=0
	my_task_list=""
	for item in $task_list
	do
		should_do_task $task $ntasks && my_task_list="$my_task_list $item"
		task=$((task + 1))
	done
	echo $my_task_list
	set -x
}

#
# Get list IB devices
#
get_ib_devices() {
	state=$1
	device_list=$(ibv_devinfo -l | tail -n +2)
	for ibdev in $device_list
	do
		num_ports=$(ibv_devinfo -d $ibdev| awk '/phys_port_cnt:/ {print $2}')
		for port in $(seq 1 $num_ports)
		do
			if ibv_devinfo -d $ibdev -i $port | grep -q $state
			then
				echo "$ibdev:$port"
			fi
		done
	done
}

#
# Get IB devices on state Active
#
get_active_ib_devices() {
	get_ib_devices PORT_ACTIVE
}

#
# Check IB devices on state INIT
#
check_machine() {
	init_dev=$(get_ib_devices PORT_INIT)
	if [ -n "${init_dev}" ]
	then
		echo "${init_dev} have state PORT_INIT"
		exit 1
	fi
}

#
# Get list of active IP interfaces
#
get_active_ip_ifaces() {
	device_list=$(ip addr | awk '/state UP/ {print $2}' | sed s/:// | cut -f 1 -d '@')
	for netdev in ${device_list}
	do
		(ip addr show ${netdev} | grep -q 'inet ') && echo ${netdev} || true
	done
}

#
# Get IP addr for a given IP iface
# Argument is the IP iface
#
get_ifaddr() {
	iface=$1
	echo $(ip addr show ${iface} | awk '/inet /{print $2}' | awk -F '/' '{print $1}')
}

get_rdma_device_ip_addr() {
	if [ ! -r /dev/infiniband/rdma_cm  ]
	then
		return
	fi

	if ! which ibdev2netdev >&/dev/null
	then
		return
	fi

	iface=`ibdev2netdev | grep Up | awk '{print $5}' | head -1`
	if [ -n "$iface" ]
	then
		ipaddr=$(get_ifaddr ${iface})
	fi

	if [ -z "$ipaddr" ]
	then
		# if there is no inet (IPv4) address, escape
		return
	fi

	ibdev=`ibdev2netdev | grep $iface | awk '{print $1}'`
	node_guid=`cat /sys/class/infiniband/$ibdev/node_guid`
	if [ $node_guid == "0000:0000:0000:0000" ]
	then
		return
	fi

	echo $ipaddr
}

get_non_rdma_ip_addr() {
	if ! which ibdev2netdev >&/dev/null
	then
		return
	fi

	# get the interface of the ip address that is the default gateway (pure Ethernet IPv4 address).
	eth_iface=$(ip route show| sed -n 's/default via \(\S*\) dev \(\S*\).*/\2/p')

	# the pure Ethernet interface should not appear in the ibdev2netdev output. it should not be an IPoIB or
	# RoCE interface.
	if ibdev2netdev|grep -qw "${eth_iface}"
	then
		echo "Failed to retrieve an IP of a non IPoIB/RoCE interface"
		exit 1
	fi

	get_ifaddr ${eth_iface}
}

#
# Prepare build environment
#
prepare() {
	echo " ==== Prepare ===="
	env
	cd ${WORKSPACE}
	if [ -d build-test ]
	then
		chmod u+rwx build-test -R
		rm -rf build-test
	fi
	./autogen.sh
	mkdir -p build-test
	cd build-test
}

check_make_distcheck() {
	echo 1..1 > make_distcheck.tap

	# If the gcc version on the host is older than 4.8.5, don't run
	# due to a compiler bug that reproduces when building with gtest
	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61886
	if (echo "4.8.5"; gcc --version | head -1 | awk '{print $3}') | sort -CV
	then
		echo "==== Testing make distcheck ===="
		make_clean && make_clean distclean
		../contrib/configure-release --prefix=$PWD/install
		$MAKEP DISTCHECK_CONFIGURE_FLAGS="--enable-gtest" distcheck
	else
		log_warning "Not testing make distcheck: GCC version is too old ($(gcc --version|head -1))"
	fi
}

#
# Expands a CPU list such as "0-3,17" to "0 1 2 3 17" (each cpu in a new line)
#
expand_cpulist() {
	cpulist=$1
	tokens=$(echo ${cpulist} | tr ',' ' ')
	for token in ${tokens}
	do
		# if there is no '-', first and last would be equal
		first=$(echo ${token} | cut -d'-' -f1)
		last=$( echo ${token} | cut -d'-' -f2)

		for ((cpu=${first};cpu<=${last};++cpu))
		do
			echo ${cpu}
		done
	done
}

#
# Get the N'th CPU that the current process can run on
#
slice_affinity() {
	set +x
	n=$1

	# get affinity mask of the current process
	compact_cpulist=$($AFFINITY bash -c 'taskset -cp $$' | cut -d: -f2)
	cpulist=$(expand_cpulist ${compact_cpulist})

	echo "${cpulist}" | head -n $((n + 1)) | tail -1
	set -x
}

#
# `rename` has a binary and Perl flavors. Ubuntu comes with Perl one and
# requires different usage.
#
rename_files() {
	expr=$1; shift
	replacement=$1; shift
	files=$*
	if rename --version | grep 'util-linux'; then
		rename "${expr}" "${replacement}" $files
		return
	fi

	rename "s/\\${expr}\$/${replacement}/" "${files}"
}

run_loopback_app() {
	test_exe=$1
	test_args="-l $2"

	affinity=$(slice_affinity 0)

	taskset -c $affinity ${test_exe} ${test_args} &
	pid=$!

	wait ${pid} || true
}


run_client_server_app() {
	test_exe=$1
	test_args=$2
	server_addr_arg=$3
	kill_server=$4
	error_emulation=$5

	server_port_arg="-p $server_port"
	server_port=$((server_port + 1))

	affinity_server=$(slice_affinity 0)
	affinity_client=$(slice_affinity 1)

	taskset -c $affinity_server ${test_exe} ${test_args} ${server_port_arg} &
	server_pid=$!

	sleep 15

	if [ $error_emulation -eq 1 ]
	then
		set +Ee
	fi

	taskset -c $affinity_client ${test_exe} ${test_args} ${server_addr_arg} ${server_port_arg} &
	client_pid=$!

	wait ${client_pid}

	if [ $error_emulation -eq 1 ]
	then
		set -eE
	fi

	if [ $kill_server -eq 1 ]
	then
		kill -9 ${server_pid}
	fi
	wait ${server_pid} || true
}

run_hello() {
	api=$1
	shift
	test_args="$@"
	test_name=${api}_hello_world

	if [ ! -x ${test_name} ]
	then
		$MAKEP -C examples ${test_name}
	fi

	# set smaller timeouts so the test will complete faster
	if [[ ${test_args} =~ "-e" ]]
	then
		export UCX_UD_TIMEOUT=15s
		export UCX_RC_TIMEOUT=1ms
		export UCX_RC_RETRY_COUNT=4
	fi

	if [[ ${test_args} =~ "-e" ]]
	then
		error_emulation=1
	else
		error_emulation=0
	fi

	run_client_server_app "./examples/${test_name}" "${test_args}" "-n $(hostname)" 0 ${error_emulation}

	if [[ ${test_args} == *"-e"* ]]
	then
		unset UCX_UD_TIMEOUT
		unset UCX_RC_TIMEOUT
		unset UCX_RC_RETRY_COUNT
	fi
}

#
# Compile and run UCP hello world example
#
run_ucp_hello() {
	if ./src/tools/info/ucx_info -e -u twe|grep ERROR
	then
		return # skip if cannot create ucp ep
	fi

	mem_types_list="host "

	if [ "X$have_cuda" == "Xyes" ]
	then
		mem_types_list+="cuda cuda-managed "
	fi

	export UCX_KEEPALIVE_INTERVAL=1s
	export UCX_KEEPALIVE_NUM_EPS=10
	export UCX_LOG_LEVEL=info
	export UCX_MM_ERROR_HANDLING=y

	for tls in all tcp,cuda shm,cuda
	do
		export UCX_TLS=${tls}
		for test_mode in -w -f -b -erecv -esend -ekeepalive
		do
			for mem_type in $mem_types_list
			do
				echo "==== Running UCP hello world with mode ${test_mode} and \"${mem_type}\" memory type ===="
				run_hello ucp ${test_mode} -m ${mem_type}
			done
		done
	done
	rm -f ./ucp_hello_world

	unset UCX_KEEPALIVE_INTERVAL
	unset UCX_KEEPALIVE_NUM_EPS
	unset UCX_LOG_LEVEL
	unset UCX_TLS
	unset UCX_MM_ERROR_HANDLING
}

#
# Compile and run UCT hello world example
#
run_uct_hello() {
	mem_types_list="host "

	if [ "X$have_cuda" == "Xyes" ] && [ -f "/sys/kernel/mm/memory_peers/nv_mem/version" ]
	then
		mem_types_list+="cuda-managed "
		if [ -f "/sys/kernel/mm/memory_peers/nv_mem/version" ]
		then
			# test RDMA GPUDirect
			mem_types_list+="cuda "
		fi
	fi

	for send_func in -i -b -z
	do
		for ucx_dev in $(get_active_ib_devices)
		do
			for mem_type in $mem_types_list
			do
				echo "==== Running UCT hello world server on rc/${ucx_dev} with sending ${send_func} and \"${mem_type}\" memory type ===="
				run_hello uct -d ${ucx_dev} -t "rc_verbs" ${send_func} -m ${mem_type}
			done
		done
		for ucx_dev in $(get_active_ip_ifaces)
		do
			echo "==== Running UCT hello world server on tcp/${ucx_dev} with sending ${send_func} ===="
			run_hello uct -d ${ucx_dev} -t "tcp" ${send_func}
		done
	done
	rm -f ./uct_hello_world
}

run_client_server() {
	test_name=ucp_client_server

	mem_types_list="host"

	if [ "X$have_cuda" == "Xyes" ]
	then
		mem_types_list+=" cuda cuda-managed "
	fi

	if [ ! -x ${test_name} ]
	then
		$MAKEP -C examples ${test_name}
	fi

	server_ip=$1
	if [ "$server_ip" == "" ]
	then
		return
	fi

	for mem_type in ${mem_types_list}
	do
		echo "==== Running UCP client-server with \"${mem_type}\" memory type ===="
		run_client_server_app "./examples/${test_name}" "-m ${mem_type}" "-a ${server_ip}" 1 0
	done
}

run_ucp_client_server() {
	echo "==== Running UCP client-server  ===="
	run_client_server $(get_rdma_device_ip_addr)
	run_client_server $(get_non_rdma_ip_addr)
	run_client_server "127.0.0.1"
}

run_io_demo() {
	server_rdma_addr=$(get_rdma_device_ip_addr)
	server_nonrdma_addr=$(get_non_rdma_ip_addr)
	mem_types_list="host "
	config_args=""

	if [ "X$have_cuda" == "Xyes" ]
	then
		mem_types_list+="cuda cuda-managed "
		config_args+="--with-iodemo-cuda"
	fi

	if [ -z "$server_rdma_addr" ] && [ -z "$server_nonrdma_addr" ]
	then
		return
	fi

	../contrib/configure-devel --prefix=$ucx_inst $config_args
	$MAKEP
	$MAKEP install

	for mem_type in $mem_types_list
	do
		echo "==== Running UCP IO demo with \"${mem_type}\" memory type ===="

		test_args="$@ -o write,read -d 128:4194304 -P 2 -i 10000 -w 10 -m ${mem_type} -q"
		test_name=io_demo

		for server_ip in $server_rdma_addr $server_nonrdma_addr
		do
			run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "${server_ip}" 1 0
		done

		if [ "${mem_type}" == "host" ]
		then
			run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "127.0.0.1" 1 0
		fi
	done

	make_clean
}

#
# Run UCX performance test
# Note: If requested running with MPI, MPI has to be initialized before
# The function accepts 0 (default value) or 1 that means launching w/ or w/o MPI
#
run_ucx_perftest() {
	if [ $# -eq 0 ]
	then
		with_mpi=0
	else
		with_mpi=$1
	fi
	ucx_inst_ptest=$ucx_inst/share/ucx/perftest

	# hack for perftest, no way to override params used in batch
	# todo: fix in perftest
	sed -s 's,-n [0-9]*,-n 100,g' $ucx_inst_ptest/msg_pow2 | sort -R >  $ucx_inst_ptest/msg_pow2_short
	cat $ucx_inst_ptest/test_types_uct                     | sort -R >  $ucx_inst_ptest/test_types_short_uct
	cat $ucx_inst_ptest/test_types_ucp     | grep -v cuda  | sort -R >  $ucx_inst_ptest/test_types_short_ucp
	cat $ucx_inst_ptest/test_types_ucp_rma | grep -v cuda  | sort -R >> $ucx_inst_ptest/test_types_short_ucp

	ucx_perftest="$ucx_inst/bin/ucx_perftest"
	uct_test_args="-b $ucx_inst_ptest/test_types_short_uct \
				-b $ucx_inst_ptest/msg_pow2_short -w 1"

	ucp_test_args="-b $ucx_inst_ptest/test_types_short_ucp \
				-b $ucx_inst_ptest/msg_pow2_short -w 1"

	# IP ifaces
	ip_ifaces=$(get_active_ip_ifaces)

	# shared memory, IB devices, IP ifaces
	devices="memory $(get_active_ib_devices) ${ip_ifaces}"

	# Run on all devices
	my_devices=$(get_my_tasks $devices)
	for ucx_dev in $my_devices
	do
		if [[ $ucx_dev =~ .*mlx5.* ]]; then
			opt_transports="-b $ucx_inst_ptest/transports"
			tls=`awk '{print $3 }' $ucx_inst_ptest/transports | tr '\n' ',' | sed -r 's/,$//; s/mlx5/x/g'`
			dev=$ucx_dev
		elif [[ $ucx_dev =~ memory ]]; then
			opt_transports="-x posix"
			tls="shm"
			dev="all"
		elif [[ " ${ip_ifaces[*]} " == *" ${ucx_dev} "* ]]; then
			opt_transports="-x tcp"
			tls="tcp"
			dev=$ucx_dev
		else
			opt_transports="-x rc_verbs"
			tls="rc_v"
			dev=$ucx_dev
		fi

		echo "==== Running ucx_perf kit on $ucx_dev ===="
		if [ $with_mpi -eq 1 ]
		then
			# Run UCP performance test
			$MPIRUN -np 2 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $AFFINITY $ucx_perftest $ucp_test_args

			# Run UCP loopback performance test
			$MPIRUN -np 1 -x UCX_NET_DEVICES=$dev -x UCX_TLS=$tls $AFFINITY $ucx_perftest $ucp_test_args "-l"
		else
			export UCX_NET_DEVICES=$dev
			export UCX_TLS=$tls

			# Run UCT performance test
			run_client_server_app "$ucx_perftest" "$uct_test_args -d ${ucx_dev} ${opt_transports}" \
								"$(hostname)" 0 0

			# Run UCP performance test
			run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0

			# Run UCP performance test with 2 threads
			run_client_server_app "$ucx_perftest" "$ucp_test_args -T 2" "$(hostname)" 0 0

			# Run UCP loopback performance test
			run_loopback_app "$ucx_perftest" "$ucp_test_args"

			unset UCX_NET_DEVICES
			unset UCX_TLS
		fi
	done

	# run cuda tests if cuda module was loaded and GPU is found, and only in
	# client/server mode, to reduce testing time
	if [ "X$have_cuda" == "Xyes" ] && [ $with_mpi -ne 1 ]
	then
		gdr_options="n "
		if (lsmod | grep -q "nv_peer_mem")
		then
			echo "GPUDirectRDMA module (nv_peer_mem) is present.."
			gdr_options+="y "
		fi

		if [ $num_gpus -gt 1 ]; then
			export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus)),$(($(($worker+1))%$num_gpus))
		fi

		cat $ucx_inst_ptest/test_types_ucp | grep cuda | sort -R > $ucx_inst_ptest/test_types_short_ucp
		sed -s 's,-n [0-9]*,-n 10 -w 1,g' $ucx_inst_ptest/msg_pow2 | sort -R > $ucx_inst_ptest/msg_pow2_short

		echo "==== Running ucx_perf with cuda memory ===="

		for memtype_cache in y n
		do
			for gdr in $gdr_options
			do
				export UCX_MEMTYPE_CACHE=$memtype_cache
				export UCX_IB_GPU_DIRECT_RDMA=$gdr
				run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
				unset UCX_MEMTYPE_CACHE
				unset UCX_IB_GPU_DIRECT_RDMA
			done
		done

		export UCX_TLS=self,shm,cma,cuda_copy
		run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
		unset UCX_TLS

		# Specifically test cuda_ipc for large message sizes
		cat $ucx_inst_ptest/test_types_ucp | grep -v cuda | sort -R > $ucx_inst_ptest/test_types_cuda_ucp
		ucp_test_args_large="-b $ucx_inst_ptest/test_types_cuda_ucp \
			             -b $ucx_inst_ptest/msg_pow2_large -w 1"
		for ipc_cache in y n
		do
			export UCX_TLS=self,sm,cuda_copy,cuda_ipc
			export UCX_CUDA_IPC_CACHE=$ipc_cache
			run_client_server_app "$ucx_perftest" "$ucp_test_args_large" "$(hostname)" 0 0
			unset UCX_CUDA_IPC_CACHE
			unset UCX_TLS
		done

		echo "==== Running ucx_perf with cuda memory and new protocols ===="

		# Add RMA tests to the list of tests
		cat $ucx_inst_ptest/test_types_ucp_rma | grep cuda | sort -R >> $ucx_inst_ptest/test_types_short_ucp

		export UCX_PROTO_ENABLE=y
		run_client_server_app "$ucx_perftest" "$ucp_test_args" "$(hostname)" 0 0
		unset UCX_PROTO_ENABLE

		unset CUDA_VISIBLE_DEVICES
	fi
}

#
# Test malloc hooks with mpi
#
test_malloc_hooks_mpi() {
	for mode in reloc bistro
	do
		for tname in malloc_hooks malloc_hooks_unmapped external_events flag_no_install
		do
			echo "==== Running memory hook (${tname} mode ${mode}) on MPI ===="
			$MPIRUN -np 1 $AFFINITY \
				./test/mpi/test_memhooks -t $tname -m ${mode}
		done

		echo "==== Running memory hook (malloc_hooks mode ${mode}) on MPI with LD_PRELOAD ===="
		ucm_lib=$PWD/src/ucm/.libs/libucm.so
		ls -l $ucm_lib
		$MPIRUN -np 1 -x LD_PRELOAD=$ucm_lib $AFFINITY \
			./test/mpi/test_memhooks -t malloc_hooks -m ${mode}
	done
}

#
# Run tests with MPI library
#
run_mpi_tests() {
	echo "1..2" > mpi_tests.tap
	if module_load hpcx-gcc && mpirun --version
	then
		# Prevent our tests from using UCX libraries from hpcx module by prepending
		# our local library path first
		export LD_LIBRARY_PATH=${ucx_inst}/lib:$LD_LIBRARY_PATH

		../contrib/configure-release --prefix=$ucx_inst --with-mpi # TODO check in -devel mode as well
		make_clean
		$MAKEP install
		$MAKEP installcheck # check whether installation is valid (it compiles examples at least)

		MPIRUN="mpirun \
				--bind-to none \
				-x UCX_ERROR_SIGNALS \
				-x UCX_HANDLE_ERRORS \
				-mca pml ob1 \
				-mca btl tcp,self \
				-mca btl_tcp_if_include lo \
				-mca orte_allowed_exit_without_sync 1 \
				-mca coll ^hcoll,ml"

		run_ucx_perftest 1
		echo "ok 1 - ucx perftest" >> mpi_tests.tap

		test_malloc_hooks_mpi
		echo "ok 2 - malloc hooks" >> mpi_tests.tap

		make_clean distclean

		module unload hpcx-gcc
	else
		echo "==== Not running MPI tests ===="
		echo "ok 1 - # SKIP because MPI not installed" >> mpi_tests.tap
		echo "ok 2 - # SKIP because MPI not installed" >> mpi_tests.tap
	fi
}

build_ucx_profiling() {
	# compile the profiling example code
	gcc -o ucx_profiling ../test/apps/profiling/ucx_profiling.c \
		-lm -lucs -I${ucx_inst}/include -L${ucx_inst}/lib -Wl,-rpath=${ucx_inst}/lib
}

#
# Test profiling infrastructure
#
test_profiling() {
	echo "==== Running profiling example  ===="

	# configure release mode, application profiling should work
	../contrib/configure-release --prefix=$ucx_inst
	make_clean
	$MAKEP
	$MAKEP install

	build_ucx_profiling

	UCX_PROFILE_MODE=log UCX_PROFILE_FILE=ucx_jenkins.prof ./ucx_profiling

	UCX_READ_PROFILE=${ucx_inst}/bin/ucx_read_profile
	$UCX_READ_PROFILE -r ucx_jenkins.prof | grep "printf" -C 20
	$UCX_READ_PROFILE -r ucx_jenkins.prof | grep -q "calc_pi"
	$UCX_READ_PROFILE -r ucx_jenkins.prof | grep -q "print_pi"
}

test_ucs_load() {
	if [ -z "${have_ptrace}" ]
	then
		log_warning "==== Not running UCS library loading test ===="
		return
	fi

	../contrib/configure-release --prefix=$ucx_inst
	make_clean
	$MAKEP
	$MAKEP install

	build_ucx_profiling

	# Make sure UCS library constructor does not call socket()
	echo "==== Running UCS library loading test ===="
	strace ./ucx_profiling &> strace.log
	! grep '^socket' strace.log
}

test_ucs_dlopen() {
	$MAKEP

	# Make sure UCM is not unloaded
	echo "==== Running UCS dlopen test with memhooks ===="
	./test/apps/test_ucs_dlopen

	# Test global config list integrity after loading/unloading of UCT
	echo "==== Running test_dlopen_cfg_print ===="
	./test/apps/test_dlopen_cfg_print
}

test_ucp_dlopen() {
	../contrib/configure-release --prefix=$ucx_inst
	make_clean
	$MAKEP
	$MAKEP install

	# Make sure UCP library, when opened with dlopen(), loads CMA module
	LIB_CMA=`find ${ucx_inst} -name libuct_cma.so.0`
	if [ -n "$LIB_CMA" ]
	then
		echo "==== Running UCP library loading test ===="
		./test/apps/test_ucp_dlopen | grep 'cma'
	else
		echo "==== Not running UCP library loading test ===="
	fi

	# Test module allow-list
	UCX_MODULES=^ib,rdmacm ./src/tools/info/ucx_info -d |& tee ucx_info_noib.log
	if grep -in "component:\s*ib$" ucx_info_noib.log
	then
		echo "IB module was loaded even though it was disabled"
		exit 1
	fi

	# Test module allow-list passed through ucp_config_modify()
	./test/apps/test_ucp_config -c "UCX_MODULES=^ib,rdmacm" |& tee ucx_config_noib.log
	if grep -in "component:\s*ib$" ucx_config_noib.log
	then
		echo "IB module was loaded even though it was disabled"
		exit 1
	fi
}

test_init_mt() {
	echo "==== Running multi-thread init ===="
	# Each thread requires 5MB. Cap threads number by total available shared memory.
	max_threads=$(df /dev/shm | awk '/shm/ {printf "%d", $4 / 5000}')
	num_threads=$(($max_threads < $(nproc) ? $max_threads : $(nproc)))
	$MAKEP
	for ((i=0;i<10;++i))
	do
		OMP_NUM_THREADS=$num_threads $AFFINITY timeout 5m ./test/apps/test_init_mt
	done
}

test_memtrack() {
	../contrib/configure-devel --prefix=$ucx_inst
	make_clean
	$MAKEP

	echo "==== Running memtrack test ===="
	UCX_MEMTRACK_DEST=stdout ./test/gtest/gtest --gtest_filter=test_memtrack.sanity
}

test_memtrack_limit() {
	../contrib/configure-devel --prefix=$ucx_inst
	make_clean
	$MAKEP

	echo "==== Running memtrack limit test ===="
	UCX_MEMTRACK_DEST=stdout UCX_HANDLE_ERRORS=none UCX_MEMTRACK_LIMIT=512MB ./test/apps/test_memtrack_limit |& grep -C 100 'SUCCESS'
	UCX_MEMTRACK_DEST=stdout UCX_HANDLE_ERRORS=none UCX_MEMTRACK_LIMIT=412MB ./test/apps/test_memtrack_limit |& grep -C 100 'reached'
}

test_unused_env_var() {
	# We must create a UCP worker to get the warning about unused variables
	echo "==== Running ucx_info env vars test ===="
	UCX_IB_PORTS=mlx5_0:1 ./src/tools/info/ucx_info -epw -u t | grep "unused" | grep -q -E "UCX_IB_PORTS"
}

test_env_var_aliases() {
	echo "==== Running MLX5 env var aliases test ===="
	if [[ `./src/tools/info/ucx_info -b | grep -P 'HW_TM *1$'` ]]
	then
		vars=( "TM_ENABLE" "TM_LIST_SIZE" "TX_MAX_BB" )
		for var in "${vars[@]}"
		do
			for tl in "RC_MLX5" "DC_MLX5"
			do
				val=$(./src/tools/info/ucx_info -c | grep "${tl}_${var}" | cut -d'=' -f2)
				if [ -z $val ]
				then
					echo "UCX_${tl}_${var} does not exist in UCX config"
					exit 1
				fi
				# To check that changing env var takes an effect,
				# create some value, which is different from the default.
				magic_val=`echo $val | sed -e ' s/inf\|auto/15/; s/n/swap/; s/y/n/; s/swap/y/; s/\([0-9]\)/\11/'`

				# Check that both (tl name and common RC) aliases work
				for var_alias in "RC" $tl
				do
					var_name=UCX_${var_alias}_${var}
					val_set=$(export $var_name=$magic_val; ./src/tools/info/ucx_info -c | grep "${tl}_${var}" | cut -d'=' -f2)
					if [ "$val_set" != "$magic_val" ]
					then
						echo "Can't set $var_name"
						exit 1
					fi
				done
			done
		done
	else
		echo "HW TM is not compiled in UCX"
	fi
}

test_malloc_hook() {
	echo "==== Running malloc hooks test ===="
	if [ -x ./test/apps/test_tcmalloc ]
	then
		./test/apps/test_tcmalloc
	fi

	if [ "X$have_cuda" == "Xyes" ]
	then
		cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic
		cuda_static_exe=./test/apps/test_cuda_hook_static

		for mode in reloc bistro
		do
			export UCX_MEM_CUDA_HOOK_MODE=${mode}

			# Run cuda memory hooks with dynamic link
			${cuda_dynamic_exe}

			# Run cuda memory hooks with static link, if exists. If the static
			# library 'libcudart_static.a' is not present, static test will not
			# be built.
			if [ -x ${cuda_static_exe} ]
			then
				${cuda_static_exe} && status="pass" || status="fail"
				[ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail"
				if [ ${status} == ${exp_status} ]
				then
					echo "Static link with cuda ${status}, as expected"
				else
					echo "Static link with cuda is expected to ${exp_status}, actual: ${status}"
					exit 1
				fi
			fi

			# Test that driver API hooks work in both reloc and bistro modes,
			# since we call them directly from the test
			${cuda_dynamic_exe} -d
			[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d

			# Test hooks in gtest
			UCX_MEM_LOG_LEVEL=diag \
				./test/gtest/gtest --gtest_filter='cuda_hooks.*'

			unset UCX_MEM_CUDA_HOOK_MODE
		done
	fi
}

run_gtest_watchdog_test() {
	watchdog_timeout=$1
	sleep_time=$2
	expected_runtime=$3
	expected_err_str="Connection timed out - abort testing"

	make -C test/gtest

	start_time=`date +%s`

	env WATCHDOG_GTEST_TIMEOUT_=$watchdog_timeout \
		WATCHDOG_GTEST_SLEEP_TIME_=$sleep_time \
		GTEST_FILTER=test_watchdog.watchdog_timeout \
		./test/gtest/gtest 2>&1 | tee watchdog_timeout_test &
	pid=$!
	wait $pid

	end_time=`date +%s`

	res="$(grep -x "$expected_err_str" watchdog_timeout_test)" || true

	rm -f watchdog_timeout_test

	if [ "$res" != "$expected_err_str" ]
	then
		echo "didn't find [$expected_err_str] string in the test output"
		exit 1
	fi

	runtime=$(($end_time-$start_time))

	if [ $runtime -gt $expected_runtime ]
	then
		echo "Watchdog timeout test takes $runtime seconds that" \
			"is greater than expected $expected_runtime seconds"
		exit 1
	fi
}

#
# Run the test suite (gtest)
# Arguments: <compiler-name> [configure-flags]
#
run_gtest() {
	compiler_name=$1
	shift
	../contrib/configure-devel --prefix=$ucx_inst $@
	make_clean
	$MAKEP

	echo "==== Running watchdog timeout test, $compiler_name compiler ===="
	run_gtest_watchdog_test 5 60 300

	export GTEST_SHARD_INDEX=$worker
	export GTEST_TOTAL_SHARDS=$nworkers
	export GTEST_RANDOM_SEED=0
	export GTEST_SHUFFLE=1
	export GTEST_TAP=2
	export GTEST_REPORT_DIR=$WORKSPACE/reports/tap
	# Run UCT tests for TCP over fastest device only
	export GTEST_UCT_TCP_FASTEST_DEV=1
	# Report TOP-20 longest test at the end of testing
	export GTEST_REPORT_LONGEST_TESTS=20
	export OMP_NUM_THREADS=4

	if [ $num_gpus -gt 0 ]; then
		export CUDA_VISIBLE_DEVICES=$(($worker%$num_gpus))
	fi

	GTEST_EXTRA_ARGS=""
	if [ "$JENKINS_TEST_PERF" == 1 ]
	then
		# Check performance with 10 retries and 2 seconds interval
		GTEST_EXTRA_ARGS="$GTEST_EXTRA_ARGS -p 10 -i 2.0"
	fi
	export GTEST_EXTRA_ARGS

	mkdir -p $GTEST_REPORT_DIR

	echo "==== Running unit tests, $compiler_name compiler ===="
	$AFFINITY $TIMEOUT make -C test/gtest test
	(cd test/gtest && rename_files .tap _gtest.tap *.tap && mv *.tap $GTEST_REPORT_DIR)

	echo "==== Running malloc hooks mallopt() test, $compiler_name compiler ===="
	# gtest returns with non zero exit code if there were no
	# tests to run. As a workaround run a single test on every
	# shard.
	$AFFINITY $TIMEOUT \
		env UCX_IB_RCACHE=n \
		MALLOC_TRIM_THRESHOLD_=-1 \
		MALLOC_MMAP_THRESHOLD_=-1 \
		GTEST_SHARD_INDEX=0 \
		GTEST_TOTAL_SHARDS=1 \
		GTEST_FILTER=malloc_hook_cplusplus.mallopt \
		make -C test/gtest test
	(cd test/gtest && rename_files .tap _mallopt_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR)

	echo "==== Running malloc hooks mmap_ptrs test with MMAP_THRESHOLD=16384, $compiler_name compiler ===="
	$AFFINITY $TIMEOUT \
		env MALLOC_MMAP_THRESHOLD_=16384 \
		GTEST_SHARD_INDEX=0 \
		GTEST_TOTAL_SHARDS=1 \
		GTEST_FILTER=malloc_hook_cplusplus.mmap_ptrs \
		make -C test/gtest test
	(cd test/gtest && rename_files .tap _mmap_ptrs_gtest.tap malloc_hook_cplusplus.tap && mv *.tap $GTEST_REPORT_DIR)

	if ! [[ $(uname -m) =~ "aarch" ]] && ! [[ $(uname -m) =~ "ppc" ]] && \
	   ! [[ -n "${JENKINS_NO_VALGRIND}" ]]
	then
		echo "==== Running valgrind tests, $compiler_name compiler ===="

		# Load newer valgrind if naative is older than 3.10
		if ! (echo "valgrind-3.10.0"; valgrind --version) | sort -CV
		then
			module load tools/valgrind-3.12.0
		fi

		$AFFINITY $TIMEOUT_VALGRIND make -C test/gtest test_valgrind
		(cd test/gtest && rename_files .tap _vg.tap *.tap && mv *.tap $GTEST_REPORT_DIR)
		module unload tools/valgrind-3.12.0
	else
		echo "==== Not running valgrind tests with $compiler_name compiler ===="
		echo "1..1"                                          > vg_skipped.tap
		echo "ok 1 - # SKIP because running on $(uname -m)" >> vg_skipped.tap
	fi

	unset OMP_NUM_THREADS
	unset GTEST_UCT_TCP_FASTEST_DEV
	unset GTEST_SHARD_INDEX
	unset GTEST_TOTAL_SHARDS
	unset GTEST_RANDOM_SEED
	unset GTEST_SHUFFLE
	unset GTEST_TAP
	unset GTEST_REPORT_DIR
	unset GTEST_EXTRA_ARGS
	unset CUDA_VISIBLE_DEVICES
}

run_gtest_default() {
	run_gtest "default"
}

run_gtest_armclang() {
	if module_load arm-compiler/arm-hpc-compiler && armclang -v
	then
		# armclang has some old go compiler, disabling go build.
		run_gtest "armclang" CC=armclang CXX=armclang++ --with-go=no
		module unload arm-compiler/arm-hpc-compiler
	else
		echo "==== Not running with armclang compiler ===="
		echo "1..1"                                          > armclang_skipped.tap
		echo "ok 1 - # SKIP because armclang not found"     >> armclang_skipped.tap
	fi
}


#
# Run the test suite (gtest) in release configuration
#
run_gtest_release() {

	echo "1..1" > gtest_release.tap

	../contrib/configure-release --prefix=$ucx_inst --enable-gtest
	make_clean
	$MAKEP

	export GTEST_SHARD_INDEX=0
	export GTEST_TOTAL_SHARDS=1
	export GTEST_RANDOM_SEED=0
	export GTEST_SHUFFLE=1
	export GTEST_TAP=2
	export GTEST_REPORT_DIR=$WORKSPACE/reports/tap
	export OMP_NUM_THREADS=4

	echo "==== Running unit tests (release configuration) ===="
	# Check:
	# - Important object sizes
	# - Unexpected RNDV test, to cover rkey handling in tag offload flow
	#   (see GH #3827 for details)
	env GTEST_FILTER=\*test_obj_size\*:\*test_ucp_tag_match.rndv_rts_unexp\* \
		$AFFINITY $TIMEOUT make -C test/gtest test
	echo "ok 1" >> gtest_release.tap

	unset OMP_NUM_THREADS
	unset GTEST_SHARD_INDEX
	unset GTEST_TOTAL_SHARDS
	unset GTEST_RANDOM_SEED
	unset GTEST_SHUFFLE
	unset GTEST_TAP
	unset GTEST_REPORT_DIR
}

run_ucx_info() {
	echo "==== Running ucx_info ===="

	./src/tools/info/ucx_info -s -f -c -v -y -d -b -p -w -e -uart -m 20M -T -M
}

run_ucx_tl_check() {

	echo "1..1" > ucx_tl_check.tap

	# Test transport selection
	../test/apps/test_ucx_tls.py -p $ucx_inst

	# Test setting many lanes
	UCX_IB_NUM_PATHS=8 \
		UCX_MAX_EAGER_LANES=4 \
		UCX_MAX_RNDV_LANES=4 \
		./src/tools/info/ucx_info -u t -e

	if [ $? -ne 0 ]; then
		echo "not ok 1" >> ucx_tl_check.tap
	else
		echo "ok 1" >> ucx_tl_check.tap
	fi
}

#
# Run all tests
#
run_tests() {
	export UCX_HANDLE_ERRORS=freeze,bt
	export UCX_ERROR_SIGNALS=SIGILL,SIGSEGV,SIGBUS,SIGFPE,SIGPIPE,SIGABRT
	export UCX_ERROR_MAIL_TO=$ghprbActualCommitAuthorEmail
	export UCX_ERROR_MAIL_FOOTER=$JOB_URL/$BUILD_NUMBER/console
	export UCX_TCP_PORT_RANGE="$((33000 + EXECUTOR_NUMBER * 100))"-"$((34000 + EXECUTOR_NUMBER * 100))"
	export UCX_TCP_CM_REUSEADDR=y
	export UCX_IB_ROCE_LOCAL_SUBNET=y

	# load cuda env only if GPU available for remaining tests
	try_load_cuda_env

	# all are running mpi tests
	run_mpi_tests

	if module_load dev/jdk && module_load dev/mvn
	then
		../contrib/configure-devel --prefix=$ucx_inst --with-java
	else
		../contrib/configure-devel --prefix=$ucx_inst
	fi
	$MAKEP
	$MAKEP install

	do_distributed_task 1 4 run_ucx_info
	do_distributed_task 2 4 run_ucx_tl_check
	do_distributed_task 1 4 run_ucp_hello
	do_distributed_task 2 4 run_uct_hello
	do_distributed_task 1 4 run_ucp_client_server
	do_distributed_task 2 4 run_ucx_perftest
	do_distributed_task 2 4 run_io_demo
	do_distributed_task 3 4 test_profiling
	do_distributed_task 1 4 test_ucs_dlopen
	do_distributed_task 3 4 test_ucs_load
	do_distributed_task 3 4 test_memtrack
	do_distributed_task 3 4 test_memtrack_limit
	do_distributed_task 0 4 test_unused_env_var
	do_distributed_task 2 4 test_env_var_aliases
	do_distributed_task 1 4 test_malloc_hook
	do_distributed_task 0 4 test_ucp_dlopen
	do_distributed_task 1 4 test_init_mt

	# all are running gtest
	run_gtest_default
	run_gtest_armclang

	do_distributed_task 1 4 run_gtest_release
}

prepare
try_load_cuda_env
if [ -n "$JENKINS_RUN_TESTS" ] || [ -n "$RUN_TESTS" ]
then
	check_machine
	run_tests
fi