/**
 * \file dnn/test/arm_common/elemwise_benchmark.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_WITH_BENCHMARK
#include "test/arm_common/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

// Benchmark one dataset in which roughly `proportion_of_inf` of the inputs lie
// above 88.376f (the overflow guard used by common vectorized expf routines;
// inputs beyond it yield inf): sampling uniformly from [0, 88.376f / (1 - p)]
// leaves a fraction p of the samples past that threshold.
#define TEST_IN_DIFF_DISTRIBUTION(proportion_of_inf, dataset_number) \
    max_val = 88.3762626647949f / (1 - proportion_of_inf);           \
    UniformFloatRNG rng##dataset_number(0.f, max_val);               \
    B.set_rng(0, &rng##dataset_number);                              \
    B.execs({{355600}, {}});

TEST_F(ARM_COMMON, BENCHMARK_ELEM_UNARY_FLOATONLY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    // UniformFloatWithZeroRNG rng(80, 100, 0.1);
    printf("Test Optr exp(x)\n");
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    float max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 1)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 2)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 3)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 4)

    printf("Test Optr tanh(x)\n");
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 5)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 6)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 7)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 8)

    printf("Test Optr fast_tanh(x)\n");
    B.set_param(Mode::FAST_TANH);
    B.execs({{355600}, {}});

    printf("Test Optr sigmoid(x)\n");
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    TEST_IN_DIFF_DISTRIBUTION(0.25, 9)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 10)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 11)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 12)

    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 13)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 14)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 15)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 16)
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;

    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);

    auto run_unary = [&](const TensorShape& shape, param::Elemwise::Mode mode,
                         const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        // One read plus one write per element, in Gi-elements.
        float computations =
                shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("%s(%s):\tlayout(%s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape.to_string().c_str(), time,
               computations * dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, dtype) run_unary(shape, mode, #mode, dtype);

#define BENCHMARK_CASES_INT(shape, dtype) \
    RUN(shape, Mode::RELU, dtype)         \
    RUN(shape, Mode::ABS, dtype)

#define BENCHMARK_CASES_FLOAT(shape, dtype) \
    BENCHMARK_CASES_INT(shape, dtype)       \
    RUN(shape, Mode::SIGMOID, dtype)        \
    RUN(shape, Mode::EXP, dtype)            \
    RUN(shape, Mode::TANH, dtype)           \
    RUN(shape, Mode::FAST_TANH, dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::Int32());
    BENCHMARK_CASES_INT(shape, dtype::Int16());
    BENCHMARK_CASES_INT(shape, dtype::Int8());
    BENCHMARK_CASES_FLOAT(shape, dtype::Float32());
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}
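
// ElemwiseMultiType covers elementwise kernels whose input and output dtypes
// differ; the case below runs quantized ReLU/abs, reading QuantizedS32 values
// and writing requantized QuantizedS8 results.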
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY_MULTI_TYPE) {
    Benchmarker<ElemwiseMultiType> B(handle());
    using Mode = ElemwiseMultiType::Param::Mode;

    const size_t RUN_TIMES = 20;
    B.set_times(RUN_TIMES).set_display(false);

    auto run_unary = [&](const TensorShape& shape, Mode mode, const char* mode_str,
                         DType src_dtype, DType dst_dtype) {
        B.set_param(mode).set_dtype(0, src_dtype).set_dtype(1, dst_dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations =
                shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("type %s %s(%s) to %s \ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               src_dtype.name(), shape.to_string().c_str(), dst_dtype.name(), time,
               computations * src_dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, src_dtype, dst_dtype) \
    run_unary(shape, mode, #mode, src_dtype, dst_dtype);

#define BENCHMARK_CASES_INT(shape, src_dtype, dst_dtype) \
    RUN(shape, Mode::QRELU, src_dtype, dst_dtype)        \
    RUN(shape, Mode::QABS, src_dtype, dst_dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::QuantizedS32(62.5f), dtype::QuantizedS8(2.5f));
#undef BENCHMARK_CASES_INT
#undef RUN
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_BINARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;

    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);

    auto run_binary = [&](const TensorShape& shape0, const TensorShape& shape1,
                          param::Elemwise::Mode mode, const char* mode_str,
                          DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
        float time = B.execs({shape0, shape1, {}}) / RUN_TIMES;
        // Two inputs are read and one output (of the broadcast shape) is written.
        float bandwidth =
                (shape0.total_nr_elems() + shape1.total_nr_elems() +
                 std::max(shape0.total_nr_elems(), shape1.total_nr_elems())) /
                (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
               time, bandwidth);
    };

#define RUN(shape0, shape1, mode, dtype) \
    run_binary(shape0, shape1, mode, #mode, dtype);

#define BENCHMARK_CASES_INT(shape0, shape1, dtype) \
    RUN(shape0, shape1, Mode::ADD, dtype)          \
    RUN(shape0, shape1, Mode::MIN, dtype)          \
    RUN(shape0, shape1, Mode::MAX, dtype)          \
    RUN(shape0, shape1, Mode::SUB, dtype)          \
    RUN(shape0, shape1, Mode::MUL, dtype)          \
    RUN(shape0, shape1, Mode::FUSE_ADD_RELU, dtype)

#define BENCHMARK_CASES_FLOAT(shape0, shape1, dtype)   \
    BENCHMARK_CASES_INT(shape0, shape1, dtype)         \
    RUN(shape0, shape1, Mode::TRUE_DIV, dtype)         \
    RUN(shape0, shape1, Mode::FUSE_ADD_SIGMOID, dtype) \
    RUN(shape0, shape1, Mode::FUSE_ADD_TANH, dtype)

#define BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1)      \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int32()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int16()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int8());  \
    BENCHMARK_CASES_FLOAT(shape0, shape1, dtype::Float32());

    TensorShape shape0 = {10, 50, 10, 100};
    TensorShape shape1 = {10, 50, 10, 100};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    // Channel-wise broadcast.
    shape1 = {1, 50, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    // Scalar broadcast.
    shape1 = {1, 1, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
#undef BENCHMARK_CASES_EVERY_DTYPE
#undef BENCHMARK_CASES_FLOAT
#undef BENCHMARK_CASES_INT
#undef RUN
}
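
// FUSE_MUL_ADD3 fuses an elementwise multiply-add (x * y + z) into one pass;
// the bandwidth estimate below charges shape0 twice, counting it as both an
// input and the shape of the written output.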
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_TERNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;

    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);

    auto run_ternary = [&](const TensorShape& shape0, const TensorShape& shape1,
                           const TensorShape& shape2, param::Elemwise::Mode mode,
                           const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(
                2, dtype);
        float time = B.execs({shape0, shape1, shape2, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() * 2 + shape1.total_nr_elems() +
                           shape2.total_nr_elems()) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s %s)\ttime(%fms)\tbandwidth(%fGBps)\n",
               mode_str, dtype.name(), shape0.to_string().c_str(),
               shape1.to_string().c_str(), shape2.to_string().c_str(), time,
               bandwidth);
    };

    TensorShape shape = {10, 50, 10, 100};
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Int32());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Int16());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Int8());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float32());

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float32());
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float16());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float32());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float16());
#endif
}

#endif  // MEGDNN_WITH_BENCHMARK