/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
 *
 * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#include "uct_test.h"

/* NOTE(review): the target of the next directive (and of the one inside the
 * extern "C" block) is not visible in the reviewed text - restore the original
 * header names before building. */
#include

extern "C" {
#include
}

/* Normalization factor converting bytes to megabytes (1 / 1024^2). Written as
 * a parenthesized constant expression so that the table below does not need a
 * runtime pow() call. */
#define MB                           (1.0 / (1024.0 * 1024.0))

/* Factors by which the expected [min, max] range of every test is widened
 * before it is run: the default one, and the one used on AArch64 CPUs. */
#define UCT_PERF_TEST_MULTIPLIER     5
#define UCT_ARM_PERF_TEST_MULTIPLIER 15


/**
 * Performance test fixture for UCT transports. It combines the uct_test and
 * test_perf bases and owns the table of performance tests to execute.
 */
class test_uct_perf : public uct_test, public test_perf {
protected:
    /* Table of tests, terminated by an entry whose title is NULL */
    static const test_spec tests[];
};


/*
 * Performance test table.
 *
 * Entries are positional initializers of test_perf::test_spec. The first field
 * is the test title; the envelope test below also reads/updates the `iters`,
 * `min` and `max` fields. The table is terminated by an entry with a NULL
 * title.
 *
 * NOTE(review): judging by the values, each entry looks like
 *   title, units, API, command, test type, wait mode, data layout,
 *   a stride value, number of message-size elements, { message sizes },
 *   an outstanding-operations limit, iteration count,
 *   offset of the measured field in ucx_perf_result_t, normalization factor,
 *   expected minimum, expected maximum, test flags
 * - confirm the exact field names against the test_spec definition.
 */
const test_perf::test_spec test_uct_perf::tests[] =
{
  { "am short latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 2.5, 0 },

  { "am short rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.8, 80.0, 0 },

  { "am short rate64", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 64 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.8, 80.0, 0 },

  { "am short iov latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 2.5, 0 },

  { "am short iov rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.8, 80.0, 0 },

  { "am short iov rate64", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 32, 32 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.8, 80.0, 0 },

  /* The trailing flags value is spelled out explicitly, like in every other
   * entry, instead of relying on implicit zero-initialization. */
  { "am bcopy latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 2.5, 0 },

  { "am bcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 1000 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 620.0, 15000.0, 0 },

  { "am zcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 },
    32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 600.0, 15000.0, 0 },

  { "am zcopy bw flush ep", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 },
    32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 600.0, 15000.0, UCX_PERF_TEST_FLAG_FLUSH_EP },

  { "put latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 1.5, 0 },

  { "put rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.8, 80.0, 0 },

  { "put bcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 2048 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 620.0, 50000.0, 0 },

  { "put zcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 2048 },
    32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 620.0, 50000.0, 0 },

  { "get latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 3.5, 0 },

  { "atomic add latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 3.5, 0 },

  { "atomic add rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average),
    1e-6, 0.5, 50.0, 0 },

  { "atomic fadd latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 3.5, 0 },

  { "atomic cswap latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 3.5, 0 },

  { "atomic swap latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 },
    1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average),
    1e6, 0.01, 3.5, 0 },

  { "am iov bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 8192, 3, { 256, 256, 512 },
    32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average),
    MB, 600.0, 15000.0, 0 },

  /* Terminator: the envelope test stops at the first NULL title */
  { NULL }
};


/*
 * Run every entry of the test table on the transport/device under test.
 *
 * The whole test is skipped on the "ugni_udt" transport. Performance limits
 * are not enforced when running on a SandyBridge CPU with at least one CPU of
 * the current affinity outside the device's local CPU set, nor on the "tcp"
 * transport (which is also limited to 1000 iterations per test).
 */
UCS_TEST_P(test_uct_perf, envelope) {
    if (has_transport("ugni_udt")) {
        UCS_TEST_SKIP;
    }

    /* For SandyBridge CPUs, don't check performance of far-socket devices */
    /* NOTE(review): the element type is inferred from the use of each element
     * as a CPU index in ucs_cpu_is_set() - confirm against get_affinity(). */
    const std::vector<int> cpus = get_affinity();
    bool check_perf             = true;
    size_t max_iter             = std::numeric_limits<size_t>::max();

    /* NOTE(review): the CPU model is queried once here instead of on every
     * loop iteration, assuming it cannot change while the test is running. */
    const bool is_sandybridge =
        (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_INTEL_SANDYBRIDGE);

    /* Select once how much the expected [min, max] range of each test is
     * widened. */
    const int multiplier =
        (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) ?
        UCT_ARM_PERF_TEST_MULTIPLIER : UCT_PERF_TEST_MULTIPLIER;

    if (is_sandybridge) {
        for (std::vector<int>::const_iterator iter = cpus.begin();
             iter != cpus.end(); ++iter) {
            if (!ucs_cpu_is_set(*iter, &GetParam()->local_cpus)) {
                UCS_TEST_MESSAGE << "Not enforcing performance on SandyBridge far socket";
                check_perf = false;
                break;
            }
        }
    }

    if (has_transport("tcp")) {
        check_perf = false; /* TODO calibrate expected performance based on transport */
        max_iter   = 1000lu;
    }

    /* Run all tests */
    for (const test_spec *test_iter = tests; test_iter->title != NULL;
         ++test_iter) {
        /* Work on a copy, so the shared table is never modified */
        test_spec test = *test_iter;

        test.max  *= multiplier;
        test.min  /= multiplier;
        test.iters = ucs_min(test.iters, max_iter);

        run_test(test, 0, check_perf, GetParam()->tl_name,
                 GetParam()->dev_name);
    }
}

UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_perf);