#!/bin/bash
#
# Example driver for the load_and_run ("lar") binaries.
#
# Depending on the options given, this script prepares example models and
# input data, builds load_and_run, optionally uploads binaries and a model to
# an arm device (rsync/ssh), and runs one group of example command lines.
# Run with -h to see the option list.
#
# Each example command is stored as ONE string and executed through
# `bash -c`, so the quotes embedded in the strings (e.g. --input "data:...")
# are interpreted by the child shell.

# Word splitting is restricted to newlines while the script runs; the
# previous IFS is restored on the last line.
OLD_IFS="$IFS"
IFS=$'\n'

# Model + input arguments shared by most example commands.
TYPICAL_MODEL_DATD="model_source/resnet50_b1_float32_without_data.mge --input \"data:input_data/resnet50_input.npy\""
DEVICE_DESC=""
WORK_DIR_PATH="."
RUN_ARM_DEVICE="false"
RUN_TARGET="diff_model"
ONLY_PREPARE_MODEL="false"
MODEL_PREAPRED="false"
ONLY_BUILD="false"
LAR_BUILT="false"
CLEAN_ALL="false"

# Values accepted by -e.
RUN_TARGETS=("diff_model")
RUN_TARGETS+=("diff_device")
RUN_TARGETS+=("fast_run")
RUN_TARGETS+=("io")
RUN_TARGETS+=("layout")
RUN_TARGETS+=("optimize")
RUN_TARGETS+=("plugin")
RUN_TARGETS+=("all")

# Print the option summary and terminate the script.
# Exits with status 255, which is what the historical `exit -1` produced.
function usage() {
    echo "$0 args1 args2 .."
    echo "available args detail:"
    echo "-p : prepare example model "
    echo "-b : build load_and_run for x86/armv7/arm64 cpu and CUDA"
    echo "-t : set the ssh arm device "
    echo "-w : set the arm device workspace dir"
    echo "-c : clean all"
    echo "-a : run all test"
    echo "-e : set the running target for test (details use \"-e\" to see)"
    echo "-h : show usage"
    exit 255
}

while getopts "pbcahe:w:t:" arg; do
    case "$arg" in
        t)
            DEVICE_DESC="$OPTARG"
            RUN_ARM_DEVICE="true"
            echo "config arm device DEVICE_DESC to ${DEVICE_DESC}"
            ;;
        w)
            WORK_DIR_PATH="$OPTARG"
            echo "config arm device WORK_DIR_PATH to ${WORK_DIR_PATH}"
            ;;
        e)
            # Accept the value only if it is one of RUN_TARGETS.
            tmp_target=null
            for target in "${RUN_TARGETS[@]}"; do
                if [[ "$target" == "$OPTARG" ]]; then
                    echo "CONFIG BUILD RUN_TARGET to : $OPTARG"
                    tmp_target="$OPTARG"
                    RUN_TARGET="$OPTARG"
                    break
                fi
            done
            if [[ "$tmp_target" == "null" ]]; then
                echo "ERR args for target (-e)"
                echo "available target usage :"
                for target in "${RUN_TARGETS[@]}"; do
                    echo " -e $target"
                done
                exit 255
            fi
            ;;
        h)
            echo "show usage"
            usage
            ;;
        a)
            echo "config RUN_TARGET=all"
            RUN_TARGET="all"
            ;;
        c)
            echo "clean all directory generated by script"
            CLEAN_ALL="true"
            ;;
        b)
            echo "run build"
            ONLY_BUILD="true"
            ;;
        p)
            echo "prepare model and input"
            ONLY_PREPARE_MODEL="true"
            ;;
        *)
            # getopts reports unknown options / missing arguments as "?".
            echo "unkonw argument"
            usage
            ;;
    esac
done

# Echo every argument and then execute it as a command line via `bash -c`.
# Arguments: one complete command line per argument.
# A failing command does not stop the remaining ones.
function run_cmds() {
    local cmd
    for cmd in "$@"; do
        echo "$cmd"
        bash -c "$cmd"
    done
}

# Succeeds when the test group named by $1 is selected, i.e. RUN_TARGET is
# that name or "all".
function target_selected() {
    [[ "$RUN_TARGET" == "$1" || "$RUN_TARGET" == "all" ]]
}

# Recreate model_source/, input_data/ and tmpdir/ and fill the first two by
# running the generator scripts under script/.
function prepare_model_and_data() {
    rm -rf model_source && mkdir model_source

    # dump mgb model
    python3 script/resnet50_mgb.py -o model_source/resnet50.pkl
    ../dump_with_testcase.py model_source/resnet50.pkl -o model_source/resnet50_with_data.mgb -d "#rand(0, 255)" --no-assert

    # prepare simple add model
    python3 script/add_demo.py --dir model_source
    python3 script/conv_demo.py --dir model_source

    # generate trt model
    script/gen_trt_model.sh

    # prepare mge model
    python3 script/resnet50_mge.py --dir model_source
    python3 script/resnet50_mge.py --dir model_source -d int8
    python3 script/resnet50_mge.py --dir model_source --inputs "#rand(0,255)"

    # make input_data
    rm -rf input_data && mkdir input_data
    python3 script/mge_input_data.py

    rm -rf tmpdir && mkdir tmpdir
}

# Build load_and_run with the host and android cross-build scripts, then
# expose the results in the current directory as lar_cpu / lar_cuda
# (symlinks) and lar_arm64 / lar_armv7 (copies).
function build_lar() {
    # build cpu and cuda version
    ../../../scripts/cmake-build/host_build.sh -r -t -e load_and_run

    # WARNING: config the cuda environment before compile
    ../../../scripts/cmake-build/host_build.sh -c -t -e load_and_run

    # build arm version
    ../../../scripts/cmake-build/cross_build_android_arm_inference.sh -r -a arm64-v8a -e load_and_run
    ../../../scripts/cmake-build/cross_build_android_arm_inference.sh -r -a armeabi-v7a -e load_and_run

    # link or copy for test; -f so that a link left by an earlier build is
    # replaced instead of making ln fail
    ln -sf ../../../build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release//build/lite/load_and_run/load_and_run lar_cpu
    ln -sf ../../../build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release//build/lite/load_and_run/load_and_run lar_cuda
    cp ../../../build_dir/android/arm64-v8a/Release/build/lite/load_and_run/load_and_run ./lar_arm64
    cp ../../../build_dir/android/armeabi-v7a/Release/build/lite/load_and_run/load_and_run ./lar_armv7
}

# Record the arm device settings and rsync the arm binaries, the float32
# resnet50 model and its input to the device.
# Arguments: $1 - ssh device description
#            $2 - work directory on the device
#            $3 - value stored in RUN_ARM_DEVICE
# Globals written: DEVICE_DESC, WORK_DIR_PATH, RUN_ARM_DEVICE
function set_arm_device_and_upload() {
    DEVICE_DESC="${1}"
    WORK_DIR_PATH="${2}"
    RUN_ARM_DEVICE="${3}"
    local cmd="rsync -aP -zz ./lar_arm64 ./lar_armv7 model_source/resnet50_b1_float32_without_data.mge input_data/resnet50_input.npy ${DEVICE_DESC}:${WORK_DIR_PATH}/"
    echo "$cmd"
    bash -c "$cmd"
}

# Run lar_cpu against the different example model files.
function test_different_model() {
    local -a cmds=()
    cmds+=("./lar_cpu model_source/resnet50_with_data.mgb")
    cmds+=("./lar_cpu model_source/resnet50_b1_float32_with_data.mge")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --lite")
    run_cmds "${cmds[@]}"
}

# Run the typical model with the different device / threading options.
# The timings in the comments are the original author's measurements.
function test_different_device() {
    local -a cmds=()
    # on dispatch the compute task is pushed to a work queue that manages
    # execution; mean 131.278 ms, stddev 15.197 ms; m_asyc_exec runs
    # asynchronously
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --cpu")
    # on dispatch the compute task is executed directly; mean 131.875 ms,
    # stddev 7.758 ms; m_asyc_exec runs synchronously
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --cpu-default")
    # multi-thread run; average run time (ms) for 1~8 threads:
    # 129.611, 84.266, 76.963, 55.212, 69.283, 77.338, 58.386, 64.585
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --multithread 4")
    # main thread pinned to a core, other tasks run on thread-pool threads:
    # 132.614, 83.095, 69.792, 54.452, 48.890, 48.206, 46.386, 53.908
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --multithread-default 4")
    # cpu multi-thread core binding (binding has little effect on x86)
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --multithread 2 --multi-thread-core-ids 1,5")
    # xpu set to run on cpu: 132.740 ms, comp_node:cpu
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --cpu")
    # xpu set to run on cuda: 6.495 ms, comp_node:gpu
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --cuda")
    run_cmds "${cmds[@]}"
}

# Run the fast-run / full-run algorithm selection examples on lar_cuda.
function test_fast_run() {
    local -a cmds=()
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --full-run")
    # fast run: search and store the algo policy
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run --fast-run-algo-policy tmpdir/algo_cache_file")
    # fast run: execute with the stored algo policy
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run-algo-policy tmpdir/algo_cache_file")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run --reproducible")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run --fast-run-shared-batch-size 1")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --fast-run --binary-equal-between-batch")
    run_cmds "${cmds[@]}"
}

# Run the input / io-dump examples, then compare the cpu and cuda binary
# io dumps.
function test_io() {
    # Remove every dump location used below (bin_io_info_cuda included, so
    # stale CUDA dumps do not survive a rerun), then recreate the directories.
    rm -rf tmpdir/bin_io_info tmpdir/bin_out_info tmpdir/bin_out_info_cuda tmpdir/bin_io_info_cuda tmpdir/io_info.txt
    mkdir -p tmpdir/bin_io_info tmpdir/bin_out_info tmpdir/bin_io_info_cuda

    local -a cmds=()
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"input_data/add_demo_input.json\"")
    #! the model must support input with nhwc shape
    cmds+=("./lar_cpu model_source/resnet50_b1_int8_without_data.mge --input \"data:input_data/cat.ppm\"")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\"")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --io-dump tmpdir/io_info.txt --iter 1 --warmup-iter 0")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --io-dump-stdout --iter 1 --warmup-iter 0")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --io-dump-stderr --iter 1 --warmup-iter 0")
    # different data in the given directory; the file name is the var id,
    # which is the same as in the txt-dump information
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --bin-io-dump tmpdir/bin_io_info --iter 1 --warmup-iter 0")
    cmds+=("./lar_cuda model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --bin-io-dump tmpdir/bin_io_info_cuda --iter 1 --warmup-iter 0")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --bin-out-dump tmpdir/bin_out_info --iter 1 --warmup-iter 0")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --copy-to-host")
    run_cmds "${cmds[@]}"

    # compare the binary io information
    python3 ../../../imperative/python/megengine/tools/compare_binary_iodump.py tmpdir/bin_io_info tmpdir/bin_io_info_cuda
}

# Run the layout related examples locally and, when an arm device was
# configured with -t, the nchw44 examples on that device over ssh.
function test_layout_related() {
    local -a cmds=()
    # very little speed up
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-nchw4")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-chwn4")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-nchw32")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-nchw64")
    # speed up
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --enable-nchw88")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --cuda --layout-transform cuda")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --cuda --layout-transform cuda --layout-transform-dump model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cuda.mge")
    cmds+=("./lar_cuda model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cuda.mge")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --cpu --layout-transform cpu")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --cpu --layout-transform cpu --layout-transform-dump model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cpu.mge")
    cmds+=("./lar_cpu model_source/resnet50_b1_float32_without_data_with_global_layout_trans_cpu.mge")
    run_cmds "${cmds[@]}"

    if [[ "${RUN_ARM_DEVICE}" == "true" ]]; then
        local -a arm_cmds=()
        local cmd
        # speed up
        arm_cmds+=("./lar_arm64 resnet50_b1_float32_without_data.mge --input \"data:resnet50_input.npy\" --cpu --enable-nchw44")
        # speed up
        arm_cmds+=("./lar_arm64 resnet50_b1_float32_without_data.mge --input \"data:resnet50_input.npy\" --cpu --enable-nchw44-dot")
        for cmd in "${arm_cmds[@]}"; do
            echo "$cmd"
            ssh -t "$DEVICE_DESC" "unset LD_PRELOAD && cd $WORK_DIR_PATH && LD_LIBRARY_PATH=./ $cmd"
        done
    else
        echo "SET arm device ON : $RUN_ARM_DEVICE"
    fi
}

# Run the graph optimization option examples.
function test_optimize() {
    local -a cmds=()
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --enable-fuse-preprocess")
    # warm up speed up
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-fuse-conv-bias-nonlinearity")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable-fuse-conv-bias-with-z")
    cmds+=("./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt")
    cmds+=("./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt --tensorrt-cache tmpdir/TRT_cache")
    cmds+=("./lar_cuda model_source/trt_conv_demo_with_data.mgb --tensorrt-cache tmpdir/TRT_cache")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --no-sanity-check --record-comp-seq2")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --disable_mem_opt")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --workspace_limit 10000")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --fake-first")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --enable_jit ")
    run_cmds "${cmds[@]}"
}

# Run the plugin examples (dispatch / var-value checks, profiling, model
# info, static memory info).
function test_plugin() {
    rm -rf tmpdir/staticMemInfoDir tmpdir/staticMemInfoDirLogs
    mkdir -p tmpdir/staticMemInfoDir

    local -a cmds=()
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --check-dispatch")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --check-var-value 5:0")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --range 2")
    cmds+=("./lar_cpu model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --profile tmpdir/opr_profile.json ")
    cmds+=("./lar_cuda model_source/add_demo_f32_without_data.mge --input \"data:[2,3,4]\" --profile-host tmpdir/opr_profile_host.json")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --model-info")
    cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --verbose")
    cmds+=("./lar_cpu model_source/resnet50_with_data.mgb --disable-assert-throw")
    # wait gdb attach to given PID
    # cmds+=("./lar_cpu $TYPICAL_MODEL_DATD --wait-gdb")
    cmds+=("./lar_cuda $TYPICAL_MODEL_DATD --get-static-mem-info tmpdir/staticMemInfoDir")
    run_cmds "${cmds[@]}"

    # view the graph with given url (usually: http://localhost:6006/)
    # mkdir tmpdir/staticMemInfoDirLogs && python3 ../../../imperative/python/megengine/tools/graph_info_analyze.py -i tmpdir/staticMemInfoDir -o tmpdir/staticMemInfoDirLogs
    # pip3 install tensorboard && tensorboard --logdir tmpdir/staticMemInfoDirLogs
}

# Remove everything this script generates in the current directory.
function clean() {
    rm -rf tmpdir model_source input_data lar_cpu lar_cuda lar_arm64 lar_armv7
}

# Entry point: handle the one-shot modes (-c, -p, -b), make sure models and
# binaries exist, then run the selected test group(s).
function main() {
    if [[ "${CLEAN_ALL}" == "true" ]]; then
        clean
        exit 0
    fi

    if [[ "${ONLY_PREPARE_MODEL}" == "true" ]]; then
        prepare_model_and_data
        MODEL_PREAPRED="true"
        exit 0
    fi

    if [[ "${ONLY_BUILD}" == "true" ]]; then
        build_lar
        LAR_BUILT="true"
        exit 0
    fi

    if [[ "${RUN_ARM_DEVICE}" == "true" ]]; then
        set_arm_device_and_upload "$DEVICE_DESC" "$WORK_DIR_PATH" "true"
    fi

    # Prepare models / build binaries only when a marker file is missing.
    # -z on the quoted value is used because an unquoted empty `find` result
    # makes `[ ... == "" ]` fail with a syntax error instead of being true.
    if [[ "${MODEL_PREAPRED}" != "true" ]]; then
        local check_model
        check_model=$(find . -name add_demo_input.json)
        if [[ -z "${check_model}" ]]; then
            prepare_model_and_data
            MODEL_PREAPRED="true"
        fi
    fi

    if [[ "${LAR_BUILT}" != "true" ]]; then
        local check_lar
        check_lar=$(find . -name lar_armv7)
        if [[ -z "${check_lar}" ]]; then
            build_lar
            LAR_BUILT="true"
        fi
    fi

    if target_selected "diff_model"; then
        test_different_model
    fi
    if target_selected "diff_device"; then
        test_different_device
    fi
    if target_selected "fast_run"; then
        test_fast_run
    fi
    if target_selected "io"; then
        test_io
    fi
    if target_selected "layout"; then
        test_layout_related
    fi
    if target_selected "optimize"; then
        test_optimize
    fi
    if target_selected "plugin"; then
        test_plugin
    fi
}

main
IFS=$OLD_IFS