/**
 * \file dnn/test/cuda/dct.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include <cstdio>
#include <string>

#include "megdnn/oprs/nn.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/dct_ref.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"

namespace megdnn {
namespace test {

/*!
 * Runs DctChannelSelectForward with a default-constructed Param over a grid of
 * {n, ic, ih, iw} source shapes. The mask tensors (inputs 1 and 2) and the
 * output are passed as empty shapes.
 */
TEST_F(CUDA, DCT) {
    DctChannelSelectForward::Param param;
    Checker<DctChannelSelectForward> checker(handle_cuda());
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    checker.set_param(param)
                            .set_dtype(0, dtype::Uint8())
                            .set_dtype(1, dtype::Int32())
                            .set_dtype(2, dtype::Int32())
                            .execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
                }
            }
        }
    }
}

/*!
 * Same shape sweep as CUDA.DCT, but with Param::Format::NCHW4 and a
 * QuantizedS8(10.f) output dtype. The checker epsilon is set to 1
 * (presumably to tolerate one quantization step of rounding difference).
 */
TEST_F(CUDA, DCT_QINT8) {
    // Declared locally, like the sibling tests, so that this test does not
    // depend on a `Param` alias being visible from some other header.
    using Param = DctChannelSelectForward::Param;
    Param param;
    Checker<DctChannelSelectForward> checker(handle_cuda());
    param.format = Param::Format::NCHW4;
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    checker.set_param(param)
                            .set_dtype(0, dtype::Uint8())
                            .set_dtype(1, dtype::Int32())
                            .set_dtype(2, dtype::Int32())
                            .set_dtype(3, dtype::QuantizedS8(10.f))
                            .set_epsilon(1)
                            .execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
                }
            }
        }
    }
}

/*!
 * Checks the FastImpl::FIX_32_MASK path against a test case generated by
 * gen_dct_case() for a 3x3x1024x768 source and 32 selected channels.
 */
TEST_F(CUDA, DCT_WITH_FIX_32_MASK) {
    using Param = DctChannelSelectForward::Param;
    Param param;
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    param.fastImpl = Param::FastImpl::FIX_32_MASK;
    auto test_case = gen_dct_case(3, 3, 1024, 768, 32, param);
    checker.set_param(param).exect(test_case->testcase_in, test_case->testcase_out);
}

/*!
 * FIX_32_MASK variant with Format::NCHW4 and a QuantizedS8(10.f) output;
 * results are compared with epsilon 1.
 */
TEST_F(CUDA, DCT_WITH_FIX_32_MASK_QINT8) {
    using Param = DctChannelSelectForward::Param;
    Param param;
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    param.fastImpl = Param::FastImpl::FIX_32_MASK;
    param.format = Param::Format::NCHW4;
    auto test_case =
            gen_dct_case(3, 3, 1024, 768, 32, param, dtype::QuantizedS8(10.f));
    checker.set_param(param).set_epsilon(1).exect(
            test_case->testcase_in, test_case->testcase_out);
}

/*!
 * Checks a hand-written test case: a {1, 3, 8, 16} Uint8 source, a mask-offset
 * tensor {0, 14, 22, 30} and a 30-entry mask-value tensor, against a fixed
 * {1, 30, 1, 2} Float32 expected output.
 */
TEST_F(CUDA, DCT_WITH_MASK) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;
    checker.set_param(param).exect(
            Testcase{
                    // Source: 3 * 8 * 16 = 384 values. The same 128-value
                    // sequence appears three times (once per channel, assuming
                    // contiguous NCHW storage).
                    TensorValue(
                            {1, 3, 8, 16}, dtype::Uint8(),
                            {109, 39,  30,  115, 71,  15,  206, 139,
                             221, 5,   18,  16,  93,  185, 99,  102,
                             205, 172, 191, 29,  185, 6,   47,  84,
                             0,   47,  105, 203, 251, 73,  196, 83,
                             3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100,
                             41,  135, 141, 206, 56,  91,  137, 199,
                             104, 192, 75,  122, 78,  65,  184, 69,
                             91,  82,  2,   172, 194, 240, 49,  145,
                             87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198,
                             238, 151, 14,  120, 213, 195, 145, 20,
                             122, 107, 217, 185, 65,  5,   115, 110,
                             82,  206, 163, 86,  2,   2,   44,  125,
                             50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204,

                             109, 39,  30,  115, 71,  15,  206, 139,
                             221, 5,   18,  16,  93,  185, 99,  102,
                             205, 172, 191, 29,  185, 6,   47,  84,
                             0,   47,  105, 203, 251, 73,  196, 83,
                             3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100,
                             41,  135, 141, 206, 56,  91,  137, 199,
                             104, 192, 75,  122, 78,  65,  184, 69,
                             91,  82,  2,   172, 194, 240, 49,  145,
                             87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198,
                             238, 151, 14,  120, 213, 195, 145, 20,
                             122, 107, 217, 185, 65,  5,   115, 110,
                             82,  206, 163, 86,  2,   2,   44,  125,
                             50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204,

                             109, 39,  30,  115, 71,  15,  206, 139,
                             221, 5,   18,  16,  93,  185, 99,  102,
                             205, 172, 191, 29,  185, 6,   47,  84,
                             0,   47,  105, 203, 251, 73,  196, 83,
                             3,   211, 32,  181, 49,  111, 114, 83,
                             148, 232, 77,  17,  35,  2,   154, 100,
                             41,  135, 141, 206, 56,  91,  137, 199,
                             104, 192, 75,  122, 78,  65,  184, 69,
                             91,  82,  2,   172, 194, 240, 49,  145,
                             87,  210, 97,  190, 179, 93,  125, 105,
                             181, 207, 148, 178, 133, 53,  25,  198,
                             238, 151, 14,  120, 213, 195, 145, 20,
                             122, 107, 217, 185, 65,  5,   115, 110,
                             82,  206, 163, 86,  2,   2,   44,  125,
                             50,  38,  41,  106, 30,  5,   151, 243,
                             238, 181, 232, 191, 161, 57,  23,  204}),
                    // Mask offsets: the 30 mask values below split into runs
                    // of 14, 8 and 8 entries.
                    TensorValue({4}, dtype::Int32(), {0, 14, 22, 30}),
                    // Mask values, one row per run delimited by the offsets.
                    TensorValue(
                            {30}, dtype::Int32(),
                            {8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
                             0, 1,  8, 16, 9, 2, 3,  10,
                             0, 1,  8, 16, 9, 2, 3,  10}),
                    {}},
            Testcase{
                    {},
                    {},
                    {},
                    // Expected output: 30 * 1 * 2 = 60 values, two per row.
                    TensorValue(
                            {1, 30, 1, 2}, dtype::Float32(),
                            {-22.850792,  -97.862236,
                             -101.043236, -4.727012,
                             28.275675,   -157.96654,
                             42.1377,     45.06531,
                             -149.77373,  24.487143,
                             -8.054966,   -13.990831,
                             -6.9395194,  -3.9211385,
                             64.79172,    -12.363858,
                             -47.875,     59.,
                             56.271786,   -62.725567,
                             120.522675,  16.559765,
                             85.74334,    112.904495,
                             99.375,      29.499973,
                             2.0220923,   -19.681704,

                             890.12494,   941.25,
                             -7.0498576,  99.47632,
                             -22.850792,  -97.862236,
                             -101.043236, -4.727012,
                             28.275675,   -157.96654,
                             42.1377,     45.06531,
                             -149.77373,  24.487143,
                             -8.054966,   -13.990831,

                             890.12494,   941.25,
                             -7.0498576,  99.47632,
                             -22.850792,  -97.862236,
                             -101.043236, -4.727012,
                             28.275675,   -157.96654,
                             42.1377,     45.06531,
                             -149.77373,  24.487143,
                             -8.054966,   -13.990831})});
}

/*!
 * Sweeps the same shape grid as CUDA.DCT; for every shape a random number of
 * selected channels in [1, ic * 64] is drawn and the test case produced by
 * gen_dct_case() is checked.
 */
TEST_F(CUDA, DCT_WITH_MASK2) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;
    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    int random_oc = static_cast<int>(rng_oc.gen_single_val());
                    int max_oc = static_cast<int>(ic * 64);
                    // Fold the random draw into [1, max_oc].
                    int mask_oc = (random_oc % max_oc) + 1;
                    auto test_case = gen_dct_case(n, ic, ih, iw, mask_oc, param);
                    checker.set_param(param).exect(
                            test_case->testcase_in, test_case->testcase_out);
                }
            }
        }
    }
}

/*!
 * NCHW4 / QuantizedS8(10.f) variant of CUDA.DCT_WITH_MASK2. The random channel
 * count is rounded up to a multiple of 4 before the case is generated.
 */
TEST_F(CUDA, DCT_WITH_MASK2_QINT8) {
    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    DctChannelSelectForward::Param param;
    param.format = DctChannelSelectForward::Param::Format::NCHW4;

    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    int random_oc = static_cast<int>(rng_oc.gen_single_val());
                    int max_oc = static_cast<int>(ic * 64);
                    int mask_oc = (random_oc % max_oc) + 1;
                    // Round up to the next multiple of 4.
                    mask_oc = (mask_oc + 3) / 4 * 4;
                    auto test_case = gen_dct_case(
                            n, ic, ih, iw, mask_oc, param, dtype::QuantizedS8(10.f));
                    checker.set_param(param).set_epsilon(1).exect(
                            test_case->testcase_in, test_case->testcase_out);
                }
            }
        }
    }
}

/*!
 * NCHW4 / QuantizedS8(10.f) sweep driven by shapes plus a tensors constraint
 * instead of a pre-generated test case. When fewer than ic * 64 channels are
 * selected, the constraint from gen_dct_constriant() is installed and explicit
 * mask shapes are passed; otherwise the constraint is cleared and the masks are
 * left empty.
 */
TEST_F(CUDA, DCT_WITH_MASK2_QINT8_CONSTRAINT) {
    DctChannelSelectForward::Param param;
    param.format = DctChannelSelectForward::Param::Format::NCHW4;

    Checker<DctChannelSelectForward> checker(handle_cuda(), false);
    checker.set_param(param)
            .set_dtype(0, dtype::Uint8())
            .set_dtype(1, dtype::Int32())
            .set_dtype(2, dtype::Int32())
            .set_dtype(3, dtype::QuantizedS8(10.f))
            .set_epsilon(1);

    UniformIntRNG rng_oc(0, 3 * 64);
    for (size_t n : {1, 3}) {
        for (size_t ic : {1, 3}) {
            for (size_t ih : {8, 16, 32, 512, 1024}) {
                for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
                    int random_oc = static_cast<int>(rng_oc.gen_single_val());
                    int max_oc = static_cast<int>(ic * 64);
                    int mask_oc = (random_oc % max_oc) + 1;
                    // Round up to the next multiple of 4.
                    mask_oc = (mask_oc + 3) / 4 * 4;
                    if (mask_oc < max_oc) {
                        checker
                                .set_tensors_constraint(gen_dct_constriant(
                                        n, ic, ih, iw, mask_oc, param))
                                .exec({TensorShape{n, ic, ih, iw},
                                       TensorShape{ic + 1},
                                       TensorShape{(size_t)mask_oc},
                                       {}});
                    } else {
                        // Every channel is selected: run without a mask.
                        checker.set_tensors_constraint({}).exec(
                                {TensorShape{n, ic, ih, iw}, {}, {}, {}});
                    }
                }
            }
        }
    }
}

#if MEGDNN_WITH_BENCHMARK
/*!
 * Benchmarks DctChannelSelectForward in several configurations and prints the
 * throughput of each. The operation count is estimated as
 * n * c * h * w * 32 and scaled by 1e-6, so dividing by the time in
 * milliseconds yields Gops.
 */
TEST_F(CUDA, BENCHMARK_DCT) {
    using Param = DctChannelSelectForward::Param;
    using DctBenchmarker = Benchmarker<DctChannelSelectForward>;

    // Benchmark every shape in `shapes` without mask tensors.
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        DctBenchmarker benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Int32())
                .set_dtype(2, dtype::Int32());
        for (auto&& shape : shapes) {
            double computation = double(shape[0]) * shape[1] * shape[2] *
                                 shape[3] * 32.0 * 1e-6;
            auto time_ms = benchmarker.execs({shape, {}, {}, {}});
            printf("execute %s, %.4f Gops\n", shape.to_string().c_str(),
                   computation / time_ms);
        }
    };

    // Benchmark the inputs of a pre-generated test case; the output dtype is
    // taken from the test case's expected output tensor.
    auto run_case = [&](const DctTestcase& testcase, Param param,
                        std::string comment = "") {
        DctBenchmarker benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Int32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, testcase.testcase_out[3].layout.dtype);

        auto src_shape = testcase.testcase_in[0].layout;
        double computation = double(src_shape[0]) * src_shape[1] * src_shape[2] *
                             src_shape[3] * 32.0 * 1e-6;
        auto time_ms = benchmarker.exect(testcase.testcase_in);
        printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
               src_shape.to_string().c_str(), computation / time_ms);
    };

    // Benchmark from shapes, with the tensors filled via `constraint`.
    // `comment` has no default argument: a defaulted parameter may not be
    // followed by a non-defaulted one, and every caller passes all arguments.
    auto run_case_constraint = [&](const DctBenchmarker::TensorsConstriant& constraint,
                                   Param param, const TensorShapeArray& shapes,
                                   std::string comment, DType output_dtype) {
        DctBenchmarker benchmarker(handle_cuda());
        benchmarker.set_param(param)
                .set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Int32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, output_dtype)
                .set_tensors_constraint(constraint);

        auto src_shape = shapes[0];
        double computation = double(src_shape[0]) * src_shape[1] * src_shape[2] *
                             src_shape[3] * 32.0 * 1e-6;
        auto time_ms = benchmarker.exec(shapes);
        printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
               src_shape.to_string().c_str(), computation / time_ms);
    };

    TensorShapeArray shapes = {
            {1, 3, 512, 512},
            {8, 3, 2176, 3840},
    };
    {
        Param param;
        run(shapes, param);
    }

    Param fix_32_param;
    fix_32_param.fastImpl = Param::FastImpl::FIX_32_MASK;
    {
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
        run_case(*test_case, fix_32_param, "FIX_32_MASK");
    }
    {
        // Same generated inputs as above, but executed with a default Param.
        Param param;
        auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
        run_case(*test_case, param, "MASK 32");
    }
    {
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        auto test_case = gen_dct_case(
                8, 3, 2176, 3840, 32, fix_32_nchw4_param, dtype::QuantizedS8(10.f));
        run_case(*test_case, fix_32_nchw4_param, "FIX_32_MASK QINT8");
    }
    {
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        auto test_case = gen_dct_case(
                8, 3, 2176, 3840, 32, fix_32_nchw4_param, dtype::QuantizedS8(10.f));
        // The case is generated with FIX_32_MASK, then executed with
        // FastImpl::NONE.
        fix_32_nchw4_param.fastImpl = Param::FastImpl::NONE;
        run_case(*test_case, fix_32_nchw4_param, "MASK 32 QINT8");
    }
    {
        Param fix_32_nchw4_param;
        fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
        fix_32_nchw4_param.format = Param::Format::NCHW4;
        TensorShapeArray shapes = {{8, 3, 2176, 3840}, {4}, {32}, {}};
        auto constraint =
                gen_dct_constriant(8, 3, 2176, 3840, 32, fix_32_nchw4_param);
        run_case_constraint(
                constraint, fix_32_nchw4_param, shapes,
                "FIX_32_MASK QINT8 Constraint", dtype::QuantizedS8(10.f));
    }
}
#endif

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen