/**
 * \file sdk/c-opr-loaders/mace/mace_loader.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#include <functional>
#include <map>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "mace/public/mace.h"

#include "extern_c_opr.h"

#if defined(__APPLE__) || defined(__MACOSX)
static const char* default_so_paths[] = {
        "/System/Library/Frameworks/OpenCL.framework/OpenCL", "libOpenCL.so"};
#elif defined(__ANDROID__)
static const char* default_so_paths[] = {
#if defined(__aarch64__)
        "/system/lib64/libOpenCL.so",
        "/system/lib64/libOpenCL_system.so",
        "/system/lib64/egl/libGLES_mali.so",
        "/system/vendor/lib64/libOpenCL.so",
        "/system/vendor/lib64/egl/libGLES_mali.so",
        "/system/vendor/lib64/libPVROCL.so",
        "/vendor/lib64/libOpenCL.so",
        "/data/data/org.pocl.libs/files/lib64/libpocl.so",
#else
        "/system/lib/libOpenCL.so",
        "/system/lib/libOpenCL_system.so",
        "/system/lib/egl/libGLES_mali.so",
        "/system/vendor/lib/libOpenCL.so",
        "/system/vendor/lib/egl/libGLES_mali.so",
        "/system/vendor/lib/libPVROCL.so",
        "/vendor/lib/libOpenCL.so",
        "/data/data/org.pocl.libs/files/lib/libpocl.so",
#endif
        "libOpenCL.so"};
#elif defined(_WIN32)
static const char* default_so_paths[] = {"OpenCL.dll"};
#elif defined(__linux__)
static const char* default_so_paths[] = {
#if defined(__x86_64__) || defined(__amd64__)
        "/usr/lib64/libOpenCL.so",
        "/usr/local/lib64/libOpenCL.so",
        "/usr/local/cuda/lib64/libOpenCL.so",
        "/opt/intel/opencl/libOpenCL.so",
        //! As in some system like apex, the driver exists here
        "/usr/lib/libOpenCL.so",
#else
        "/usr/lib/libOpenCL.so",
        "/usr/lib32/libOpenCL.so",
        "/usr/local/lib/libOpenCL.so",
        "/usr/local/lib/libpocl.so",
        "/usr/local/cuda/lib/libOpenCL.so",
#endif
        "libOpenCL.so"};
#endif

#define ASSERT(x, msg)                                                       \
    do {                                                                     \
        if (!(x)) {                                                          \
            printf("error at %s:%d %s\n", __FILE__, __LINE__, __FUNCTION__); \
            printf(msg);                                                     \
            __builtin_trap();                                                \
        }                                                                    \
    } while (0)

inline bool file_exists(const char* name) {
    struct stat buffer;
    return (stat(name, &buffer) == 0);
}
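
//! Runtime configuration is taken from environment variables (all of them are
//! read in the code below):
//!   MGB_MACE_LOADER_FORMAT      "NHWC" to feed tensors in NHWC, default NCHW
//!   MGB_MACE_RUNTIME            "GPU" to run on the GPU, default CPU
//!   MGB_MACE_NR_THREADS         number of CPU threads, default 1
//!   MGB_MACE_OPENCL_CACHE_PATH  OpenCL compiled-kernel cache path (required
//!                               when running on GPU)
//!   MGB_MACE_TUNING_PARAM_PATH  optional OpenCL tuning parameter file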

class MGBOprDescImpl {
    struct UserData {
        std::shared_ptr<mace::MaceEngine> engine;
        size_t nr_inputs, nr_outputs;
        std::vector<std::vector<uint32_t>> output_shapes;
        std::vector<std::string> input_names, output_names;
    };

    static UserData* user_data(const MGBOprDesc* self) {
        return static_cast<UserData*>(self->user_data);
    }

    static void release(MGBOprDesc* self) {
        // free all data buffers
        delete user_data(self);
        delete self;
    }

    static size_t hash(const MGBOprDesc* self) {
        return reinterpret_cast<size_t>(self);
    }

    static int is_same(const MGBOprDesc* self, const MGBOprDesc* rhs) {
        return self == rhs;
    }

    static void infer_shape(const MGBOprDesc* self, const MGBTensorShape* input,
                            MGBTensorShape* output) {
        auto ud = user_data(self);
        // infer output shape from user data
        for (size_t i = 0; i < ud->nr_outputs; i++) {
            output[i].ndim = ud->output_shapes[i].size();
            for (size_t j = 0; j < output[i].ndim; j++) {
                output[i].shape[j] = ud->output_shapes[i][j];
            }
        }
    }

    static void infer_dtype(const MGBOprDesc*, const MGBDType* input,
                            MGBDType* output) {
        ASSERT(input[0] == MGB_DTYPE_FLOAT32, "Input dtype is not float32");
        output[0] = MGB_DTYPE_FLOAT32;
    }

    static void execute(const MGBOprDesc* self, const MGBTensor* input,
                        const MGBTensor* output) {
        auto ud = user_data(self);
        // create input and output tensor buffers
        std::map<std::string, mace::MaceTensor> mace_inputs;
        std::map<std::string, mace::MaceTensor> mace_outputs;
        auto mace_data_format = mace::DataFormat::NCHW;
        char* data_format = getenv("MGB_MACE_LOADER_FORMAT");
        if (data_format != nullptr && !strcmp(data_format, "NHWC")) {
            mace_data_format = mace::DataFormat::NHWC;
        }
        for (size_t i = 0; i < ud->nr_inputs; ++i) {
            // allocate input
            uint32_t ndim = input[i].layout.shape.ndim;
            auto input_shape = std::vector<int64_t>(
                    input[i].layout.shape.shape,
                    input[i].layout.shape.shape + ndim);
            int64_t input_size = std::accumulate(
                    input_shape.begin(), input_shape.end(), 1,
                    std::multiplies<int64_t>());
            auto buffer_in = std::shared_ptr<float>(
                    new float[input_size], std::default_delete<float[]>());
            memcpy(buffer_in.get(), input[i].data, input_size * sizeof(float));
            mace_inputs[ud->input_names[i]] =
                    mace::MaceTensor(input_shape, buffer_in, mace_data_format);
        }
        for (size_t i = 0; i < ud->nr_outputs; ++i) {
            // allocate output
            uint32_t ndim = output[i].layout.shape.ndim;
            auto output_shape = std::vector<int64_t>(
                    output[i].layout.shape.shape,
                    output[i].layout.shape.shape + ndim);
            int64_t output_size = std::accumulate(
                    output_shape.begin(), output_shape.end(), 1,
                    std::multiplies<int64_t>());
            auto buffer_out = std::shared_ptr<float>(
                    new float[output_size], std::default_delete<float[]>());
            mace_outputs[ud->output_names[i]] =
                    mace::MaceTensor(output_shape, buffer_out, mace_data_format);
        }

        // run the model
        auto status = (ud->engine)->Run(mace_inputs, &mace_outputs);
        ASSERT(status == mace::MaceStatus::MACE_SUCCESS,
               "Error in running mace engine");

        // send computed output to MGB: copy each mace output buffer back into
        // the corresponding MGB output tensor
        int idx = 0;
        for (auto it = mace_outputs.begin(); it != mace_outputs.end(); it++) {
            float* to = static_cast<float*>(output[idx].data);
            const float* from = (it->second).data().get();
            int64_t size = std::accumulate(
                    output[idx].layout.shape.shape,
                    output[idx].layout.shape.shape + output[idx].layout.shape.ndim,
                    static_cast<int64_t>(1), std::multiplies<int64_t>());
            memcpy(to, from, size * sizeof(float));
            ++idx;
        }
    }

public:
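    //! make() deserializes the opr param buffer dumped for this loader. The
    //! layout implied by the parsing code below (all length/count fields are
    //! 32-bit unsigned integers in host byte order) is:
    //!   [nr_inputs][nr_outputs]
    //!   nr_inputs  x { [name_len][name bytes] }
    //!   nr_outputs x { [name_len][name bytes] }
    //!   one uint32_t that is skipped (see the `+ 1` on shape_buf)
    //!   nr_outputs x { [ndim][dim_0 ... dim_(ndim-1)] }
    //!   [model_buf_len][model graph proto bytes]
    //!   [param_buf_len][model weights bytes]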
    static MGBOprDesc* make(size_t nr_input, const void* buf, size_t buf_len) {
        auto ud = std::make_unique<UserData>();
        std::shared_ptr<mace::MaceEngine> engine;

        mace::DeviceType device_type = mace::DeviceType::CPU;
        char* runtime_mode = getenv("MGB_MACE_RUNTIME");
        if (runtime_mode != nullptr && !strcmp(runtime_mode, "GPU")) {
            device_type = mace::DeviceType::GPU;
        }
        mace::MaceEngineConfig config(device_type);

        // set number of threads for cpu, default 1
        if (device_type == mace::DeviceType::CPU) {
            int nthread = 1;
            char* str_nthread = getenv("MGB_MACE_NR_THREADS");
            if (str_nthread != nullptr) {
                nthread = atoi(str_nthread);
            }
            config.SetCPUThreadPolicy(nthread,
                                      mace::CPUAffinityPolicy::AFFINITY_NONE);
        }

        // set gpu context, mainly opencl path
        if (device_type == mace::DeviceType::GPU) {
            std::shared_ptr<mace::GPUContext> gpu_context;
            char* cache_path = getenv("MGB_MACE_OPENCL_CACHE_PATH");
            ASSERT(cache_path, "there must be an opencl cache file path");
            char* param_path = getenv("MGB_MACE_TUNING_PARAM_PATH");
            std::string opencl_param_path("");
            if (param_path != nullptr) {
                opencl_param_path = std::string(param_path);
            }
            std::string storage_path(cache_path);
            gpu_context = mace::GPUContextBuilder()
                                  .SetStoragePath(storage_path)
                                  .SetOpenCLParameterPath(opencl_param_path)
                                  .Finalize();
            config.SetGPUContext(gpu_context);
            config.SetGPUHints(
                    static_cast<mace::GPUPerfHint>(mace::GPUPerfHint::PERF_HIGH),
                    static_cast<mace::GPUPriorityHint>(
                            mace::GPUPriorityHint::PRIORITY_HIGH));
        }

        std::vector<std::string> input_names, output_names;

        // extract all information from buf
        void* buffer = const_cast<void*>(buf);
        ud->nr_inputs = *reinterpret_cast<uint32_t*>(buffer);
        ud->nr_outputs = *(reinterpret_cast<uint32_t*>(buffer) + 1);

        // interpret input names
        char* name_buf = reinterpret_cast<char*>(buffer) + 8;
        for (size_t i = 0; i < ud->nr_inputs; i++) {
            size_t ilen = *reinterpret_cast<uint32_t*>(name_buf);
            input_names.push_back(std::string(name_buf + 4, ilen));
            name_buf += (ilen + 4);
        }

        // interpret output names
        buffer = name_buf;
        name_buf = reinterpret_cast<char*>(buffer);
        for (size_t i = 0; i < ud->nr_outputs; i++) {
            size_t olen = *reinterpret_cast<uint32_t*>(name_buf);
            output_names.push_back(std::string(name_buf + 4, olen));
            name_buf += (olen + 4);
        }
        ud->input_names = input_names;
        ud->output_names = output_names;

        // interpret output shapes
        buffer = name_buf;
        uint32_t* shape_buf = reinterpret_cast<uint32_t*>(buffer) + 1;
        for (size_t i = 0; i < ud->nr_outputs; i++) {
            size_t olen = *reinterpret_cast<uint32_t*>(shape_buf);
            ud->output_shapes.push_back(
                    std::vector<uint32_t>(shape_buf + 1, shape_buf + olen + 1));
            shape_buf += (olen + 1);
        }

        // the remaining bytes hold the model graph proto and the weights
        buffer = shape_buf;
        const size_t model_buf_len = *reinterpret_cast<uint32_t*>(buffer);
        unsigned char* model_buf = reinterpret_cast<unsigned char*>(buffer) + 4;
        const size_t param_buf_len =
                *reinterpret_cast<uint32_t*>(model_buf + model_buf_len);
        unsigned char* param_buf = model_buf + model_buf_len + 4;

        // create mace engine
        auto create_engine_status = mace::CreateMaceEngineFromProto(
                model_buf, model_buf_len, param_buf, param_buf_len, input_names,
                output_names, config, &engine);
        ASSERT(create_engine_status == mace::MaceStatus::MACE_SUCCESS,
               "Error in creating mace engine");
        ud->engine = engine;

        auto ret = std::make_unique<MGBOprDesc>();
        mgb_init_opr_desc(ret.get(), ud->nr_outputs, "mace");
#define a(n) ret->n = &n;
        MGB_OPR_DESC_FOREACH_MEM_FN(a);
        a(infer_dtype);
#undef a
        ret->user_data = ud.release();
        return ret.release();
    }
};

class MGBOprLoaderImpl {
    static MGBOprDesc* create_desc(size_t nr_input, const void* buf,
                                   size_t buf_len) {
        return MGBOprDescImpl::make(nr_input, buf, buf_len);
    }

public:
    static MGBOprLoader make() { return {"mace", create_desc}; }
};

extern "C" {
// public interface: called by the host after this library is loaded, to
// register the "mace" loader with the extern-c-opr API
__attribute__((visibility("default"))) void MGB_C_OPR_INIT_FUNC(
        const MGBExternCOprApi* (*get_api)(int)) {
    const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION);
    ASSERT(api, "Create api failed");
    MGBOprLoader loader = MGBOprLoaderImpl::make();
    api->register_loader(&loader);
}
}  // extern "C"