/**
 * \file dnn/test/cuda/pooling.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/cuda/fixture.h"

#include "megdnn/tensor_iter.h"
#include "test/common/checker.h"
#include "test/common/pooling.h"

#include "src/common/utils.h"
#include "test/cuda/utils.h"

// to check cudnn version
#include <cudnn.h>
#include "test/cuda/benchmark.h"

namespace {
// Two-level stringification so that the macro argument is expanded before it
// is turned into a string literal.
#define V1(v) #v
#define V(v)  V1(v)
// Builds "<NAME>v<major>.<minor>.<patch>" from the cuDNN version macros; used
// below as the algorithm name handed to AlgoChecker.
#define DEF_NAME(NAME) \
    #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
}  // namespace

namespace megdnn {
namespace test {

// Forward pooling over every (dtype, format, test case) combination, first
// with the parameters as given and then once more with AVERAGE replaced by
// AVERAGE_COUNT_EXCLUDE_PADDING.
TEST_F(CUDA, POOLING_FORWARD) {
    auto args = pooling::get_args();
    using Format = param::Pooling::Format;
    std::vector<DType> dtypes{
            dtype::Float16(), dtype::BFloat16(), dtype::Float32()};
    if (check_compute_capability(6, 0)) {
        // int pooling is supported only for Pascal or higher
        dtypes.push_back(dtype::Int8());
    }

    // Runs the full sweep. When count_exclude_padding is set, only the AVERAGE
    // cases are kept and they are executed as AVERAGE_COUNT_EXCLUDE_PADDING.
    auto run_all = [&](bool count_exclude_padding) {
        for (auto dt : dtypes)
            for (auto format : {Format::NCHW, Format::NHWC})
                for (auto&& arg : args) {
                    auto param = arg.param;
                    if (count_exclude_padding) {
                        if (param.mode != Pooling::Mode::AVERAGE)
                            continue;
                        param.mode = Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
                    }
                    auto src = arg.ishape;
                    param.format = format;
                    if (param.format == Format::NHWC) {
                        src = cvt_src_or_dst_nchw2nhwc(src);
                    }
                    Checker<Pooling> checker(handle_cuda());
                    if (dt == dtype::Int8()) {
                        // different versions of cuDNN differ in rounding
                        // behavior; setting eps to 1 to allow for rounding
                        // errors.
                        checker.set_epsilon(1 + 1e-3);
                    } else if (dt == dtype::BFloat16()) {
                        checker.set_epsilon(2e-2);
                    } else {
                        checker.set_epsilon(1e-2);
                    }
                    checker.set_param(param)
                            .set_dtype(0, dt)
                            .set_dtype(1, dt)
                            .exec(TensorShapeArray{src, {}});
                }
    };

    run_all(false);
    /* add test for new Mode temporarily */
    run_all(true);
}

// Backward pooling in Float32, Float16 and BFloat16 for every test case, then
// again for the AVERAGE cases with AVERAGE_COUNT_EXCLUDE_PADDING.
TEST_F(CUDA, POOLING_BACKWARD) {
    auto args = pooling::get_args();

    // Runs the backward checks. When count_exclude_padding is set, only the
    // AVERAGE cases are kept and they are executed as
    // AVERAGE_COUNT_EXCLUDE_PADDING.
    auto run_all = [&](bool count_exclude_padding) {
        // Iterate by value: the mode override below stays local to the
        // iteration and the constraint lambda captures its own copy anyway.
        for (auto arg : args) {
            if (count_exclude_padding) {
                if (arg.param.mode != Pooling::Mode::AVERAGE)
                    continue;
                arg.param.mode = Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
            }

            Checker<PoolingBackward> checker(handle_cuda());
            TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32());
            TensorLayout olayout;

            // Tensor constraint: run the forward operator on the device with
            // tensor 0 as input and copy the result back into tensor 1, so
            // that tensor 1 holds the forward output of tensor 0 when the
            // backward operator is checked.
            auto constraint = [this,
                               arg](CheckerHelper::TensorValueArray& tensors_orig) {
                megdnn_assert(tensors_orig.size() == 4);
                auto opr = handle_cuda()->create_operator<PoolingForward>();
                opr->param() = arg.param;

                auto tensors_cuda_storage = CheckerHelper::alloc_tensors(
                        handle_cuda(),
                        {tensors_orig[0].layout, tensors_orig[1].layout}, 0);
                auto&& tensors_cuda = *tensors_cuda_storage;

                // host -> device copy of the forward input
                auto span = tensors_cuda[0].layout.span();
                auto dst = static_cast<dt_byte*>(tensors_cuda[0].raw_ptr()) +
                           span.low_byte;
                auto src = static_cast<const dt_byte*>(tensors_orig[0].raw_ptr()) +
                           span.low_byte;
                megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte());

                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors_cuda[0].layout, tensors_cuda[1].layout);
                auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size);
                Workspace workspace{
                        static_cast<dt_byte*>(workspace_cuda), workspace_size};
                opr->exec(tensors_cuda[0], tensors_cuda[1], workspace);
                megdnn_free(handle_cuda(), workspace_cuda);

                // device -> host copy of the forward output
                span = tensors_cuda[1].layout.span();
                dst = static_cast<dt_byte*>(tensors_orig[1].raw_ptr()) +
                      span.low_byte;
                src = static_cast<const dt_byte*>(tensors_cuda[1].raw_ptr()) +
                      span.low_byte;
                megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte());
            };

            // Deduce the forward output layout for this input layout.
            {
                auto opr = handle_cuda()->create_operator<PoolingForward>();
                opr->param() = arg.param;
                opr->deduce_layout(ilayout, olayout);
            }

            // Applies one dtype to all four tensors of the backward operator.
            auto set_dtype = [&checker](DType dt) {
                checker.set_dtype(0, dt)
                        .set_dtype(1, dt)
                        .set_dtype(2, dt)
                        .set_dtype(3, dt);
            };

            checker.set_tensors_constraint(constraint);

            set_dtype(dtype::Float32());
            checker.set_param(arg.param).exec(
                    TensorShapeArray{ilayout, olayout, olayout, ilayout});

            Float16PeriodicalRNG rng;
            set_dtype(dtype::Float16());
            checker.set_param(arg.param)
                    .set_rng(0, &rng)
                    .set_epsilon(1e-2)
                    .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});

            BFloat16PeriodicalRNG bf16_rng;
            set_dtype(dtype::BFloat16());
            checker.set_param(arg.param)
                    .set_rng(0, &bf16_rng)
                    .set_epsilon(1e-2)
                    .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
        }
    };

    run_all(false);
    /* add test for new Mode temporarily */
    run_all(true);
}

// 4-bit quantized forward pooling in NCHW, signed and asymmetric, with all
// three pooling modes for each dtype.
TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
    param.format = Param::Format::NCHW;

    checker.set_dtype(0, dtype::QuantizedS4(3.1415926f));
    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{20, 96, 22, 33}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{20, 24, 22, 33}, {}});

    checker.set_dtype(0, dtype::Quantized4Asymm(3.1415926f, 3));
    // Start again from MAX; otherwise the mode left over from the signed
    // sequence would be reused and MAX would never run for this dtype.
    param.mode = Param::Mode::MAX;
    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{20, 96, 22, 33}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{20, 24, 22, 33}, {}});
}

// QuantizedS8 forward pooling in NCHW4 and NCHW32, asserting through
// AlgoChecker that the algorithm named after the cuDNN version is selected.
TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param;
    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
    checker.set_epsilon(1 + 1e-3);
    checker.set_before_exec_callback(
            AlgoChecker<PoolingForward>(DEF_NAME(cudnnForward)));
    for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) {
        param.format = format;
        param.mode = Param::Mode::MAX;
        checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
        param.mode = Param::Mode::AVERAGE;
        checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}});
        param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
        checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
    }
}

#if CUDNN_VERSION >= 7500
// QuantizedS8 forward pooling in NCHW32 with the "CUDA_NCHW32" algorithm and
// inputs drawn from the full int8 range.
TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param;
    auto i8_min = std::numeric_limits<int8_t>().min();
    auto i8_max = std::numeric_limits<int8_t>().max();
    UniformIntRNG int_rng{i8_min, i8_max};
    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
    checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW32"));
    param.format = Param::Format::NCHW32;
    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
    checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
}
#endif

// Signed 4-bit forward pooling in NCHW64, all three modes.
TEST_F(CUDA, POOLING_FORWARD_NCHW64_Q4) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
    UniformIntRNG int_rng{-8, 7};
    checker.set_dtype(0, dtype::QuantizedS4(1.f));
    param.format = Param::Format::NCHW64;
    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
}

// Asymmetric 4-bit forward pooling in NCHW64, all three modes.
TEST_F(CUDA, POOLING_FORWARD_NCHW64_U4) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
    UniformIntRNG int_rng{0, 15};
    checker.set_dtype(0, dtype::Quantized4Asymm(1.f, 3));
    param.format = Param::Format::NCHW64;
    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{4, 8, 28, 28, 64}, {}});
}

// Signed 4-bit forward pooling in NHWC, all three modes.
TEST_F(CUDA, POOLING_FORWARD_NHWC_Q4) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
    UniformIntRNG int_rng{-8, 7};
    checker.set_dtype(0, dtype::QuantizedS4(1.f));
    param.format = Param::Format::NHWC;
    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
    checker.set_param(param).exec({{2, 28, 28, 16}, {}});
    checker.set_param(param).exec({{2, 177, 233, 16}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{3, 13, 28, 32}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{4, 29, 28, 64}, {}});
}

// Asymmetric 4-bit forward pooling in NHWC, all three modes.
TEST_F(CUDA, POOLING_FORWARD_NHWC_U4) {
    require_compute_capability(7, 5);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param{Param::Mode::MAX, 1, 1, 2, 2, 2, 2};
    UniformIntRNG int_rng{0, 15};
    checker.set_dtype(0, dtype::Quantized4Asymm(1.f, 3));
    param.format = Param::Format::NHWC;
    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
    checker.set_param(param).exec({{2, 28, 28, 16}, {}});
    checker.set_param(param).exec({{2, 177, 233, 16}, {}});
    param.mode = Param::Mode::AVERAGE;
    checker.set_param(param).exec({{3, 13, 28, 32}, {}});
    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
    checker.set_param(param).exec({{4, 29, 28, 64}, {}});
}

// QuantizedS8 forward pooling in CHWN4, all three modes, three shapes each.
TEST_F(CUDA, POOLING_FORWARD_CHWN4) {
    require_compute_capability(6, 1);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param;
    auto i8_min = std::numeric_limits<int8_t>().min();
    auto i8_max = std::numeric_limits<int8_t>().max();
    UniformIntRNG int_rng{i8_min, i8_max};
    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
    param.format = Param::Format::CHWN4;
    for (auto mode :
         {Param::Mode::MAX, Param::Mode::AVERAGE,
          Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
        param.mode = mode;
        checker.set_epsilon(1e-3).set_rng(0, &int_rng);
        checker.set_param(param).exec({{8, 28, 28, 64, 4}, {}});
        checker.set_param(param).exec({{8, 28, 28, 15, 4}, {}});
        checker.set_param(param).exec({{8, 28, 28, 30, 4}, {}});
    }
}

// QuantizedS8 forward pooling in NCHW4 with the "CUDA_NCHW4" algorithm, all
// three modes, three shapes each.
TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) {
    require_compute_capability(6, 1);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param;
    auto i8_min = std::numeric_limits<int8_t>().min();
    auto i8_max = std::numeric_limits<int8_t>().max();
    UniformIntRNG int_rng{i8_min, i8_max};
    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
    param.format = Param::Format::NCHW4;
    checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW4"));
    for (auto mode :
         {Param::Mode::MAX, Param::Mode::AVERAGE,
          Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
        param.mode = mode;
        checker.set_epsilon(1e-3).set_rng(0, &int_rng);
        checker.set_param(param).exec({{64, 8, 28, 28, 4}, {}});
        checker.set_param(param).exec({{15, 8, 28, 28, 4}, {}});
        checker.set_param(param).exec({{30, 8, 28, 28, 4}, {}});
    }
}

// QuantizedS8 forward pooling in NCHW32 with the "CUDA_NCHW32" algorithm, all
// three modes, three shapes each.
TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) {
    require_compute_capability(6, 1);
    using Param = param::Pooling;
    Checker<Pooling> checker(handle_cuda());
    Param param;
    auto i8_min = std::numeric_limits<int8_t>().min();
    auto i8_max = std::numeric_limits<int8_t>().max();
    UniformIntRNG int_rng{i8_min, i8_max};
    checker.set_dtype(0, dtype::QuantizedS8(0.1f));
    checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW32"));
    param.format = Param::Format::NCHW32;
    for (auto mode :
         {Param::Mode::MAX, Param::Mode::AVERAGE,
          Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
        param.mode = mode;
        checker.set_epsilon(1e-3).set_rng(0, &int_rng);
        checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
        checker.set_param(param).exec({{15, 8, 28, 28, 32}, {}});
        checker.set_param(param).exec({{30, 8, 28, 28, 32}, {}});
    }
}

#if MEGDNN_WITH_BENCHMARK
// Times QuantizedS8 forward pooling in three layouts (NCHW4, reported under
// the "cudnn" label, CHWN4 and NCHW32) and prints the per-run time together
// with the bandwidth derived from the input + output byte count.
TEST_F(CUDA, BENCHMARK_POOLING_CHWN4) {
    CUBenchmarker<PoolingForward> bencher(handle_cuda());
    size_t nr_times = 1000;
    bencher.set_times(nr_times);
    using Param = param::Pooling;
    Param param;
    auto run_bench = [&](size_t N, size_t C, size_t H, size_t W, size_t stride,
                         size_t padding, size_t window,
                         Param::Mode mode = Param::Mode::MAX) {
        param.mode = mode;
        param.pad_h = param.pad_w = padding;
        param.window_h = param.window_w = window;
        param.stride_h = param.stride_w = stride;
        bencher.set_dtype(0, dtype::QuantizedS8{0.1f});

        param.format = Param::Format::NCHW4;
        bencher.set_param(param);
        auto time_cudnn = bencher.execs({{N, C / 4, H, W, 4}, {}}) / nr_times;

        param.format = Param::Format::CHWN4;
        bencher.set_param(param);
        auto time_chwn4 = bencher.execs({{C / 4, H, W, N, 4}, {}}) / nr_times;

        // The format has to be switched before the NCHW32 shape is timed;
        // otherwise this measurement would still run with the CHWN4 params.
        param.format = Param::Format::NCHW32;
        bencher.set_param(param);
        auto time_nchw32 = bencher.execs({{N, C / 32, H, W, 32}, {}}) / nr_times;

        size_t oh = infer_conv_shape(H, window, stride, padding),
               ow = infer_conv_shape(W, window, stride, padding);
        // bytes read (input) plus bytes written (output)
        float io = (N * C * H * W + N * C * oh * ow) * sizeof(int8_t);
        printf("time(cudnn)=%.2f ms, time(chwn4)=%.2f ms, time(nchw32)=%.2f "
               "ms, "
               "bandwidth(cudnn)=%.2f Gb/s, bandwidth(chwn4)=%.2f Gb/s, "
               "bandwidth(nchw32)=%.2f Gb/s\n",
               time_cudnn, time_chwn4, time_nchw32, io / (1e6 * time_cudnn),
               io / (1e6 * time_chwn4), io / (1e6 * time_nchw32));
    };
    run_bench(64, 64, 112, 112, 2, 1, 2);
    run_bench(256, 64, 112, 112, 2, 1, 2);
    run_bench(64, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE);
    run_bench(256, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE);
    run_bench(64, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
    run_bench(256, 64, 112, 112, 2, 1, 2, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen