/**
 * \file example/cpp_example/basic.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include <thread>
#include "example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include <cstring>
#include <iostream>
#include "misc.h"

using namespace lite;
using namespace example;

namespace {

void output_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        printf("output[%zu] name %s \n", index,
               network->get_all_output_name()[index].c_str());
        std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(index);
        size_t ndim = output_tensor->get_layout().ndim;
        for (size_t i = 0; i < ndim; i++) {
            printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
                   output_tensor->get_layout().shapes[i]);
        }
    }
}

void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        auto output_tensor = network->get_output_tensor(index);
        void* out_data = output_tensor->get_memory_ptr();
        size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                            output_tensor->get_layout().get_elem_size();
        LiteDataType dtype = output_tensor->get_layout().data_type;
        float max = -1000.0f;
        float min = 1000.0f;
        int max_idx = 0;
        int min_idx = 0;
        float sum = 0.0f;
#define cb(_dtype, _real_dtype)                                            \
    case LiteDataType::_dtype: {                                           \
        for (size_t i = 0; i < out_length; i++) {                          \
            _real_dtype data = static_cast<_real_dtype*>(out_data)[i];     \
            sum += data;                                                   \
            if (max < data) {                                              \
                max = data;                                                \
                max_idx = i;                                               \
            }                                                              \
            if (min > data) {                                              \
                min = data;                                                \
                min_idx = i;                                               \
            }                                                              \
        }                                                                  \
    } break;

        switch (dtype) {
            cb(LITE_FLOAT, float);
            cb(LITE_INT, int);
            cb(LITE_INT8, int8_t);
            cb(LITE_UINT8, uint8_t);
            default:
                printf("unknown datatype");
        }
        printf("output_length %zu index %zu max=%e, max_idx=%d, min=%e, min_idx=%d, "
               "sum=%e\n",
               out_length, index, max, max_idx, min, min_idx, sum);
    }
#undef cb
}
}  // namespace

namespace {

bool basic_load_from_path(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto layout = input_tensor->get_layout();
    for (size_t i = 0; i < layout.ndim; i++) {
        printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
    }

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    auto layout0 = src_tensor->get_layout();
    for (size_t i = 0; i < layout0.ndim; i++) {
        printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
    }
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
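
    //! note: wait() has returned for the last forward, so the output tensors
    //! read below already hold the final results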
    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}

bool basic_load_from_path_with_loader(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    lite::set_loader_lib_path(args.loader_path);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto input_layout = input_tensor->get_layout();

    //! copy or forward data to network
    auto src_tensor = parse_npy(input_path);
    auto src_layout = src_tensor->get_layout();
    if (src_layout.ndim != input_layout.ndim) {
        printf("src ndim is not equal to model input ndim\n");
    }
    //! pay attention: the input shape can change
    for (size_t i = 0; i < input_layout.ndim; i++) {
        if (input_layout.shapes[i] != src_layout.shapes[i]) {
            printf("src shape is not equal to input shape\n");
        }
    }
    input_tensor->set_layout(src_tensor->get_layout());

    //! reset or forward data to network
    input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());

    //! forward
    network->forward();
    network->wait();

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}

bool basic_load_from_memory(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();

    FILE* fin = fopen(network_path.c_str(), "rb");
    if (!fin) {
        printf("failed to open %s.\n", network_path.c_str());
    }
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto len = fread(buf.get(), 1, size, fin);
    if (len < 1) {
        printf("read file failed.\n");
    }
    fclose(fin);

    network->load_model(buf.get(), size);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool async_forward(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    Config config;
    config.options.var_sanity_check_first_run = false;
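    //! note: the option above adds extra checking work to the first forward,
    //! so the async example turns it off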
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set async mode and callback
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
        std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
        finished = true;
    });

#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool set_input_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set input callback
    volatile bool finished = false;
    network->set_start_callback(
            [&finished](const std::unordered_map<
                        std::string,
                        std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : inputs) {
                    std::cout << "input name: " << item.first
                              << " input dim: " << item.second.second->get_layout().ndim
                              << std::endl;
                }
                finished = true;
            });

#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);
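    //! note: the spin loop above only exits after the start callback has set
    //! `finished`, so the input information has already been printed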
    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool set_output_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set output callback
    volatile bool finished = false;
    network->set_finish_callback(
            [&finished](const std::unordered_map<
                        std::string,
                        std::pair<IO, std::shared_ptr<Tensor>>>& outputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : outputs) {
                    std::cout << "output name: " << item.first
                              << " output dim: " << item.second.second->get_layout().ndim
                              << std::endl;
                }
                finished = true;
            });

#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    network->wait();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
}  // namespace

REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path);
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader);
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory);
REGIST_EXAMPLE("async_forward", async_forward);
REGIST_EXAMPLE("set_input_callback", set_input_callback);
REGIST_EXAMPLE("set_output_callback", set_output_callback);

#if LITE_WITH_CUDA
namespace {
bool load_from_path_run_cuda(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    set_log_level(LiteLogLevel::DEBUG);

    //! config the network running in CUDA device
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "img0_comp_fullface";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);
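    //! note: is_host = false marks this input as device (CUDA) memory, so the
    //! pointer bound to it later must live on the GPU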
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with the user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
}  // namespace

REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda);
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}