/**
 * \file dnn/test/cuda/cutlass_matmul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include <cuda.h>

#include <cstring>
#include <memory>
#include <regex>

#include "megdnn/oprs/linalg.h"
#include "src/common/utils.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#if CUDA_VERSION >= 9020
namespace megdnn {
namespace test {
namespace {

// Runs `algo` on an (m, k) x (k, n) problem, replicates the A operand and the
// reference result 8 times along the m axis, then checks that running the same
// algorithm on the enlarged problem reproduces the replicated result.
void test_multibatchsize(
        Handle* handle_cuda, DType A_dtype, DType B_dtype, DType C_dtype,
        const char* algo, const std::vector<matrix_mul::TestArg>& args,
        param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT,
        const std::function<bool(const matrix_mul::TestArg&)>& filter = {}) {
    Checker<MatrixMulForward> checker(handle_cuda, false);
    if (algo) {
        checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(algo));
    }
    std::unique_ptr<RNG> rng;
    if (A_dtype.enumv() == DTypeEnum::Float32) {
        rng = std::make_unique<UniformFloatRNG>(-1, 1);
        megdnn_assert(
                B_dtype.enumv() == DTypeEnum::Float32 &&
                C_dtype.enumv() == DTypeEnum::Float32);
    }
    megdnn_assert(rng != nullptr);
    struct Compare {
        bool is_same(dt_float32 expected, dt_float32 actual) const {
            return expected == actual;
        }
    };
    // copy rhs -> lhs; lhs is 8 times the size of rhs
    auto copy = [](SyncedTensor<dt_float32>& lhs, SyncedTensor<dt_float32>& rhs) {
        size_t chunk = rhs.layout().span().dist_byte();
        size_t tot = lhs.layout().span().dist_byte();
        megdnn_assert(tot % chunk == 0);
        char* pointer_lhs = reinterpret_cast<char*>(lhs.ptr_mutable_host());
        const char* pointer_rhs = reinterpret_cast<const char*>(rhs.ptr_host());
        for (size_t i = 0; i < tot; i += chunk) {
            std::memcpy(pointer_lhs + i, pointer_rhs, chunk);
        }
    };
    using Param = param::MatrixMul;
    megdnn_assert(format == Param::Format::DEFAULT);
    for (auto&& arg : args) {
        megdnn_assert(arg.mask == 0x0);
        // make m, n, k big enough
        size_t m = arg.m, n = (arg.n << 3), k = (arg.k << 3);
        size_t m_prime = (m << 3);
        if (filter && filter(arg))
            continue;
        TensorShape A{m, k}, B{k, n}, C{m, n};
        TensorShape A_prime{m_prime, k}, C_prime{m_prime, n};
        SyncedTensor<dt_float32> A_tensor{handle_cuda, {A, A_dtype}},
                B_tensor{handle_cuda, {B, B_dtype}},
                C_tensor{handle_cuda, {C, C_dtype}},
                A_tensor_prime{handle_cuda, {A_prime, A_dtype}},
                C_tensor_prime{handle_cuda, {C_prime, C_dtype}},
                C_tensor_batch{handle_cuda, {C_prime, C_dtype}};
        rng->gen(A_tensor.tensornd_host());
        rng->gen(B_tensor.tensornd_host());
        copy(A_tensor_prime, A_tensor);

        auto opr_reference = handle_cuda->create_operator<MatrixMulForward>();
        {
            opr_reference->execution_policy().algo.reset();
            for (auto i : opr_reference->get_all_algorithms_info_safe(
                         A_tensor.layout(), B_tensor.layout(), C_tensor.layout())) {
                if (std::regex_match(
                            i.desc.name.c_str(),
                            std::regex("(" + std::string(algo) + ")(.*)"))) {
                    opr_reference->execution_policy().algo = i.desc;
                    break;
                }
            }
            megdnn_assert(opr_reference->execution_policy().algo.valid());
            size_t ws_size = opr_reference->get_workspace_in_bytes(
                    A_tensor.layout(), B_tensor.layout(), C_tensor.layout());
            WorkspaceWrapper ws_reference(handle_cuda, ws_size);
            opr_reference->exec(
                    A_tensor.tensornd_dev(), B_tensor.tensornd_dev(),
                    C_tensor.tensornd_dev(), ws_reference.workspace());
        }
        copy(C_tensor_prime, C_tensor);
        checker.set_dtype(0, A_dtype)
                .set_dtype(1, B_dtype)
                .set_dtype(2, C_dtype)
                .set_epsilon(1e-6)
                .exect({A_tensor_prime.tensornd_host(), B_tensor.tensornd_host(), {}},
                       {{}, {}, C_tensor_prime.tensornd_host()});
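
        // Re-run the reference operator directly on the 8x enlarged problem so the
        // batched output can be compared against the replicated single-batch result.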
        {
            opr_reference->execution_policy().algo.reset();
            for (auto i : opr_reference->get_all_algorithms_info_safe(
                         A_tensor_prime.layout(), B_tensor.layout(),
                         C_tensor_batch.layout())) {
                if (std::regex_match(
                            i.desc.name.c_str(),
                            std::regex("(" + std::string(algo) + ")(.*)"))) {
                    opr_reference->execution_policy().algo = i.desc;
                    break;
                }
            }
            megdnn_assert(opr_reference->execution_policy().algo.valid());
            size_t ws_size = opr_reference->get_workspace_in_bytes(
                    A_tensor_prime.layout(), B_tensor.layout(),
                    C_tensor_batch.layout());
            WorkspaceWrapper ws_reference(handle_cuda, ws_size);
            opr_reference->exec(
                    A_tensor_prime.tensornd_dev(), B_tensor.tensornd_dev(),
                    C_tensor_batch.tensornd_dev(), ws_reference.workspace());
        }
        C_tensor_batch.check_with(C_tensor_prime);
    }
}

#if MEGDNN_WITH_BENCHMARK
struct BenchArgs {
    size_t m, n, k, mask = 0x0;
};

std::vector<BenchArgs> get_square_matmul_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{128, 128, 128});
    args.emplace_back(BenchArgs{256, 256, 256});
    args.emplace_back(BenchArgs{512, 512, 512});
    args.emplace_back(BenchArgs{1024, 1024, 1024});
    args.emplace_back(BenchArgs{2048, 2048, 2048});
    args.emplace_back(BenchArgs{4096, 4096, 4096});
    return args;
}

std::vector<BenchArgs> get_feat_model_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{2, 4096, 4096});
    args.emplace_back(BenchArgs{2, 1024, 6912});
    args.emplace_back(BenchArgs{2, 3456, 3456});
    args.emplace_back(BenchArgs{2, 2304, 2304});
    args.emplace_back(BenchArgs{1, 256, 8192});
    args.emplace_back(BenchArgs{2, 864, 864});
    args.emplace_back(BenchArgs{2, 9, 64});
    args.emplace_back(BenchArgs{4, 4096, 4096});
    args.emplace_back(BenchArgs{4, 1024, 6912});
    args.emplace_back(BenchArgs{4, 3456, 3456});
    args.emplace_back(BenchArgs{4, 2304, 2304});
    args.emplace_back(BenchArgs{2, 256, 8192});
    args.emplace_back(BenchArgs{4, 864, 864});
    args.emplace_back(BenchArgs{4, 9, 64});
    args.emplace_back(BenchArgs{8, 4096, 4096});
    args.emplace_back(BenchArgs{8, 1024, 6912});
    args.emplace_back(BenchArgs{8, 3456, 3456});
    args.emplace_back(BenchArgs{8, 2304, 2304});
    args.emplace_back(BenchArgs{4, 256, 8192});
    args.emplace_back(BenchArgs{8, 864, 864});
    args.emplace_back(BenchArgs{4, 9, 64});
    args.emplace_back(BenchArgs{16, 4096, 4096});
    args.emplace_back(BenchArgs{16, 1024, 6912});
    args.emplace_back(BenchArgs{16, 3456, 3456});
    args.emplace_back(BenchArgs{16, 2304, 2304});
    args.emplace_back(BenchArgs{8, 256, 8192});
    args.emplace_back(BenchArgs{16, 864, 864});
    args.emplace_back(BenchArgs{8, 9, 64});
    args.emplace_back(BenchArgs{32, 4096, 4096});
    args.emplace_back(BenchArgs{32, 1024, 6912});
    args.emplace_back(BenchArgs{32, 3456, 3456});
    args.emplace_back(BenchArgs{32, 2304, 2304});
    args.emplace_back(BenchArgs{16, 256, 8192});
    args.emplace_back(BenchArgs{32, 864, 864});
    args.emplace_back(BenchArgs{32, 9, 64});
    args.emplace_back(BenchArgs{64, 4096, 4096});
    args.emplace_back(BenchArgs{64, 1024, 6912});
    args.emplace_back(BenchArgs{64, 3456, 3456});
    args.emplace_back(BenchArgs{64, 2304, 2304});
    args.emplace_back(BenchArgs{32, 256, 8192});
    args.emplace_back(BenchArgs{64, 864, 864});
    args.emplace_back(BenchArgs{64, 9, 64});
    args.emplace_back(BenchArgs{128, 4096, 4096});
    args.emplace_back(BenchArgs{128, 1024, 6912});
    args.emplace_back(BenchArgs{128, 3456, 3456});
    args.emplace_back(BenchArgs{128, 2304, 2304});
    args.emplace_back(BenchArgs{64, 256, 8192});
    args.emplace_back(BenchArgs{128, 864, 864});
    args.emplace_back(BenchArgs{128, 9, 64});
    return args;
}
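
// Shapes for the fp16 tensor-op matmul benchmarks; the kernels they exercise are
// only compiled with CUDA >= 10.1, hence the version guard below.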
#if CUDA_VERSION >= 10010
std::vector<BenchArgs> get_f16_feat_model_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{128, 9216, 9216});
    args.emplace_back(BenchArgs{128, 6400, 6400});
    args.emplace_back(BenchArgs{128, 5184, 5184});
    return args;
}
#endif

// Benchmarks the given algorithm against cuBLAS on the same shapes and prints the
// time, throughput, and speed ratio for each case.
void benchmark_matrix_mul(
        Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
        DType B_dtype, DType C_dtype, const char* algo = nullptr,
        param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT) {
    megdnn_assert(A_dtype.enumv() == B_dtype.enumv());
    CUBenchmarker<MatrixMulForward> benchmarker(handle);
    CUBenchmarker<MatrixMulForward> benchmarker_cublas(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cublas.set_display(false).set_times(RUNS);
    benchmarker_cublas.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS"));
    benchmarker.set_dtype(0, A_dtype).set_dtype(1, B_dtype).set_dtype(2, C_dtype);
    benchmarker_cublas.set_dtype(0, A_dtype)
            .set_dtype(1, B_dtype)
            .set_dtype(2, C_dtype);
    using Param = MatrixMul::Param;
    for (auto&& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = arg.mask & 0x1;
        param.transposeB = arg.mask & 0x2;
        param.format = format;
        size_t A0 = m, A1 = k, B0 = k, B1 = n;
        if (param.transposeA) {
            std::swap(A0, A1);
        }
        if (param.transposeB) {
            std::swap(B0, B1);
        }
        benchmarker.set_param(param);
        TensorShape A{A0, A1}, B{B0, B1}, C{m, n};
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<
                            MatrixMulForward, OprProxy<MatrixMulForward>, CUTimer>(
                            benchmarker, {A, B, C}, algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({A, B, C}) / RUNS;
        }
        benchmarker_cublas.set_param(param);
        auto time_in_ms_cublas = benchmarker_cublas.execs({A, B, C}) / RUNS;
        float flo = 2.0 * m * n * k / (1e12);
        printf("A=%s, B=%s, C=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cublas)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cublas)=%.2f\n",
               A.to_string().c_str(), B.to_string().c_str(), C.to_string().c_str(),
               algo, time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cublas,
               (flo / (time_in_ms_cublas * 1e-3)), algo,
               time_in_ms_cublas / time_in_ms);
    }
}
#endif
}  // namespace

TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) {
    auto args = matrix_mul::get_matmul_args_no_mask();
    test_multibatchsize(
            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
            "CUTLASS_FLOAT32_SIMT_128X128X8_32X64X8", args,
            param::MatrixMul::Format::DEFAULT);
}

TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_MULTI_BATCHSIZE) {
    auto args = matrix_mul::get_matmul_args_no_mask();
    test_multibatchsize(
            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
            "CUTLASS_FLOAT32_SIMT_SPLIT_K_128X128X8_32X64X8", args,
            param::MatrixMul::Format::DEFAULT,
            [](const matrix_mul::TestArg& arg) { return arg.k <= arg.n; });
}

TEST_F(CUDA, CUTLASS_GEMV_BATCHED_STRIDED_128_MULTI_BATCHSIZE) {
    auto args = matrix_mul::get_matmul_args_no_mask();
    test_multibatchsize(
            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
            "CUTLASS_FLOAT32_SIMT_GEMV_BATCHED_STRIDED_128", args,
            param::MatrixMul::Format::DEFAULT);
}

TEST_F(CUDA, CUTLASS_GEMV_BATCHED_STRIDED_64_MULTI_BATCHSIZE) {
    auto args = matrix_mul::get_matmul_args_no_mask();
    test_multibatchsize(
            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
            "CUTLASS_FLOAT32_SIMT_GEMV_BATCHED_STRIDED_64", args,
            param::MatrixMul::Format::DEFAULT);
}

TEST_F(CUDA, CUTLASS_GEMV_BATCHED_STRIDED_32_MULTI_BATCHSIZE) {
    auto args = matrix_mul::get_matmul_args_no_mask();
    test_multibatchsize(
            handle_cuda(), dtype::Float32(), dtype::Float32(), dtype::Float32(),
            "CUTLASS_FLOAT32_SIMT_GEMV_BATCHED_STRIDED_32", args,
            param::MatrixMul::Format::DEFAULT);
}
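
// Each cb entry below is (test id, threadblock M/N/K, warp M/N/K); the tile shapes
// are stringized into the CUTLASS algorithm name that the test requests, e.g.
// cb(5, 128, 128, 8, 32, 64, 8) exercises "CUTLASS_FLOAT32_SIMT_128X128X8_32X64X8".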
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
    cb(1, 64, 256, 8, 32, 64, 8);         \
    cb(2, 256, 64, 8, 64, 32, 8);         \
    cb(3, 32, 256, 8, 16, 64, 8);         \
    cb(4, 256, 32, 8, 64, 16, 8);         \
    cb(5, 128, 128, 8, 32, 64, 8);        \
    cb(6, 128, 64, 8, 64, 32, 8);         \
    cb(7, 64, 128, 8, 32, 64, 8);         \
    cb(8, 128, 32, 8, 64, 32, 8);         \
    cb(9, 32, 128, 8, 32, 64, 8);         \
    cb(10, 64, 64, 8, 32, 64, 8);         \
    cb(11, 32, 64, 8, 32, 64, 8);         \
    cb(12, 64, 32, 8, 64, 32, 8);         \
    cb(13, 32, 32, 8, 32, 32, 8);         \
    cb(14, 8, 32, 8, 8, 32, 8);           \
    cb(15, 16, 32, 8, 16, 32, 8);         \
    cb(16, 16, 64, 8, 16, 64, 8);         \
    cb(17, 16, 128, 8, 16, 64, 8);

#define cb(name, tbm, tbn, tbk, wm, wn, wk)                                          \
    TEST_F(CUDA, CUTLASS_GEMM_##name) {                                              \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float32(), dtype::Float32(), dtype::Float32(), handle_cuda(), \
                "CUTLASS_FLOAT32_SIMT_" #tbm "X" #tbn "X" #tbk "_" #wm "X" #wn       \
                "X" #wk);                                                            \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#define cb(name, tbm, tbn, tbk, wm, wn, wk)                                          \
    TEST_F(CUDA, CUTLASS_GEMM_SPLIT_K_##name) {                                      \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float32(), dtype::Float32(), dtype::Float32(), handle_cuda(), \
                "CUTLASS_FLOAT32_SIMT_SPLIT_K_" #tbm "X" #tbn "X" #tbk "_" #wm "X"   \
                #wn "X" #wk,                                                         \
                param::MatrixMul::Format::DEFAULT, 8, 1e-3,                          \
                matrix_mul::get_matmul_args_split_k());                              \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_KERNEL

#if CUDA_VERSION >= 10010
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb)     \
    cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \
    cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \
    cb(3, 128, 128, 32, 64, 64, 32, 8, 8, 4);

#define cb(name, tbm, tbn, tbk, wm, wn, wk, im, in, ik)                              \
    TEST_F(CUDA, CUTLASS_F16_884_GEMM_##name) {                                      \
        require_compute_capability(7, 0);                                            \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float16(), dtype::Float16(), dtype::Float16(), handle_cuda(), \
                "CUTLASS_FLOAT16_TENSOR_OP_h" #im #in #ik "_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk,                                             \
                param::MatrixMul::Format::DEFAULT, 8, 1e-2,                          \
                matrix_mul::get_matmul_args());                                      \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#define cb(name, tbm, tbn, tbk, wm, wn, wk, im, in, ik)                              \
    TEST_F(CUDA, CUTLASS_F16_884_GEMM_SPLIT_K_##name) {                              \
        require_compute_capability(7, 0);                                            \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float16(), dtype::Float16(), dtype::Float16(), handle_cuda(), \
                "CUTLASS_FLOAT16_TENSOR_OP_SPLIT_K_h" #im #in #ik "_" #tbm "X" #tbn  \
                "X" #tbk "_" #wm "X" #wn "X" #wk,                                    \
                param::MatrixMul::Format::DEFAULT, 8, 1e-3,                          \
                matrix_mul::get_matmul_args_split_k(), true,                         \
                param::MatrixMul::ComputeMode::FLOAT32);                             \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_KERNEL
#endif

#if CUDA_VERSION >= 10020
#define MEGDNN_FOREACH_CUTLASS_KERNEL(cb)      \
    cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \
    cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \
    cb(3, 128, 128, 32, 64, 64, 32, 16, 8, 8);

#define cb(name, tbm, tbn, tbk, wm, wn, wk, im, in, ik)                              \
    TEST_F(CUDA, CUTLASS_F16_1688_GEMM_##name) {                                     \
        require_compute_capability(7, 5);                                            \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float16(), dtype::Float16(), dtype::Float16(), handle_cuda(), \
                "CUTLASS_FLOAT16_TENSOR_OP_h" #im #in #ik "_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk,                                             \
                param::MatrixMul::Format::DEFAULT, 8, 1e-2,                          \
                matrix_mul::get_matmul_args(), true,                                 \
                param::MatrixMul::ComputeMode::FLOAT32);                             \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb
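
// Split-K counterparts of the 16x8x8 tensor-op kernels above; these run against the
// dedicated split-k argument set with a tighter 1e-3 epsilon.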
#define cb(name, tbm, tbn, tbk, wm, wn, wk, im, in, ik)                              \
    TEST_F(CUDA, CUTLASS_F16_1688_GEMM_SPLIT_K_##name) {                             \
        require_compute_capability(7, 5);                                            \
        matrix_mul::check_matrix_mul(                                                \
                dtype::Float16(), dtype::Float16(), dtype::Float16(), handle_cuda(), \
                "CUTLASS_FLOAT16_TENSOR_OP_SPLIT_K_h" #im #in #ik "_" #tbm "X" #tbn  \
                "X" #tbk "_" #wm "X" #wn "X" #wk,                                    \
                param::MatrixMul::Format::DEFAULT, 8, 1e-3,                          \
                matrix_mul::get_matmul_args_split_k());                              \
    }
MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
#undef cb

#undef MEGDNN_FOREACH_CUTLASS_KERNEL
#endif

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) {
    benchmark_matrix_mul(
            handle_cuda(), get_square_matmul_args(), dtype::Float32(),
            dtype::Float32(), dtype::Float32(), "CUTLASS_FLOAT32_SIMT");
}

TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
    benchmark_matrix_mul(
            handle_cuda(), get_feat_model_args(), dtype::Float32(), dtype::Float32(),
            dtype::Float32(), "CUTLASS_FLOAT32_SIMT");
}

#if CUDA_VERSION >= 10010
TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) {
    benchmark_matrix_mul(
            handle_cuda(), get_f16_feat_model_args(), dtype::Float16(),
            dtype::Float16(), dtype::Float16(), "CUTLASS_FLOAT16_TENSOR_OP");
}
#endif
#endif

}  // namespace test
}  // namespace megdnn

#endif

// vim: syntax=cpp.doxygen