#include #include #include #include #include #include #include #include "utils.h" #include #define DOUBLE_PRECISION #ifdef VERBOSE void print_matrix(double *M, size_t n) { int i, j; printf("M = [ \n"); for (i = 0; i < n; i++) { printf(" "); for (j = 0; j < n; j++) printf("%.4f ", M[i * n + j]); printf("\n"); } printf(" ]\n"); return; } #endif static inline int test_results(double *a, double *b, size_t n) { int i, j; for (i = 0; i < n; i++) for (j = 0; j < n; j++) if (b[i * n + j] < (1.0 - TOLERANCE) * a[i * n + j] || b[i * n + j] > (1.0 + TOLERANCE) * a[i * n + j]) { printf("Element %d,%d: %f != %f!\n", i, j, a[i * n + j], b[i * n + j]); return 1; } return 0; } int test_mcl(double *A, double *B, double *C, size_t N) { /** A true tiled matrix multiply would separate a matrix on both rows and columns * however, this creates a different memory pattern that can't be as easily expressed * with the simple subbuffer interafe that we have created thus far. So as a functionality * test, I am just breaking matrix A, and matrix C by row. For the best performance on GEMM * we still need to implement more functionality **/ char src_path[1024]; strcpy(src_path, XSTR(_MCL_TEST_PATH)); strcat(src_path, "/gemmN.cl"); struct timespec start, end; mcl_handle **hdl = NULL; const size_t msize = N * N * sizeof(double); const size_t row_size = N * sizeof(double); const size_t tile_size = 64 * row_size; /** TODO: This should be parametric **/ const size_t num_tiles = msize / tile_size; uint64_t pes[MCL_DEV_DIMS] = {64, N, 1}; uint64_t i, j; unsigned int errs = 0; float rtime; int ret; printf("tile_size: %ld\n", tile_size); printf("MCL Test..."); hdl = (mcl_handle **)malloc(sizeof(mcl_handle *) * num_tiles * rep); if (!hdl) { printf("Error allocating memmory for MCL hanlders. Aborting.\n"); goto err; } #ifdef DOUBLE_PRECISION mcl_prg_load(src_path, "-DDOUBLE_PRECISION", MCL_PRG_SRC); #else mcl_prg_load(src_path, "-DSINGLE_PRECISION", MCL_PRG_SRC); #endif clock_gettime(CLOCK_MONOTONIC, &start); mcl_register_buffer(C, msize, MCL_ARG_RESIDENT | MCL_ARG_DYNAMIC | MCL_ARG_OUTPUT); for (i = 0; i < rep; i++) { for (j = 0; j < num_tiles; j++) { hdl[(i * num_tiles) + j] = mcl_task_create(); if (!hdl[(i * num_tiles) + j]) { printf("Error creating MCL task. Aborting.\n"); goto err_hdl; } if (mcl_task_set_kernel(hdl[(i * num_tiles) + j], "gemmN", 4)) { printf("Error setting %s kernel. Aborting.\n", "gemmN"); goto err_hdl; } if (mcl_task_set_arg(hdl[(i * num_tiles) + j], 0, (void *)A + (tile_size * j), tile_size, MCL_ARG_INPUT | MCL_ARG_BUFFER | MCL_ARG_RESIDENT)) { printf("Error setting up task input A. Aborting.\n"); goto err_hdl; } if (mcl_task_set_arg(hdl[(i * num_tiles) + j], 1, (void *)B, msize, MCL_ARG_INPUT | MCL_ARG_BUFFER | MCL_ARG_RESIDENT)) { printf("Error setting up task input B. Aborting.\n"); goto err_hdl; } if (mcl_task_set_arg(hdl[(i * num_tiles) + j], 2, (void *)&N, sizeof(int), MCL_ARG_INPUT | MCL_ARG_SCALAR)) { printf("Error setting up task input N. Aborting.\n"); goto err_hdl; } if (mcl_task_set_arg_buffer(hdl[(i * num_tiles) + j], 3, (void *)C, tile_size, tile_size * j, MCL_ARG_BUFFER | MCL_ARG_DYNAMIC | MCL_ARG_RESIDENT | MCL_ARG_OUTPUT)) { printf("Error setting up task output. Aborting.\n"); goto err_hdl; } if ((ret = mcl_exec(hdl[(i * num_tiles) + j], pes, NULL, flags))) { printf("Error submitting task (%d)! Aborting.\n", ret); goto err_hdl; } } if (synct) for (j = 0; j < num_tiles; j++) { if (mcl_wait(hdl[(i * num_tiles) + j])) { printf("Request timed out!\n"); goto err_hdl; } } } if (!synct) { printf("Waiting for all MCL tasks to complete..."); fflush(stdout); if (mcl_wait_all()) { printf("Error waiting for requests to complete!\n"); goto err_hdl; } printf("done!\n"); } mcl_unregister_buffer(C); clock_gettime(CLOCK_MONOTONIC, &end); for (i = 0; i < rep; i++) if (hdl[i]->ret == MCL_RET_ERROR) { printf("Error executing task %" PRIu64 "!\n", i); errs++; } if (errs) printf("Detected %u errors!\n", errs); else { rtime = ((float)tdiff(end, start)) / BILLION; printf("Done.\n Test time : %f seconds\n", rtime); printf(" Throughput: %f tasks/s\n", ((float)rep) / rtime); } for (i = 0; i < rep; i++) mcl_hdl_free(hdl[i]); free(hdl); return errs; err_hdl: free(hdl); err: return -1; } int gemm_seq(double *A, double *B, double *C, size_t N) { int i, j, k; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { for (k = 0; k < N; ++k) { C[i * N + j] += A[i * N + k] * B[k * N + j]; } } } return 0; } int main(int argc, char **argv) { double *A, *B, *C, *C_test; int i, j, ret = -1; struct timespec start, end; mcl_banner("GEMM N Test"); parse_global_opts(argc, argv); switch (type) { case 0: { flags = MCL_TASK_CPU; break; } case 1: { flags = MCL_TASK_GPU; break; } case 2: { flags = MCL_TASK_ANY; break; } default: { printf("Unrecognized resource type (%" PRIu64 "). Aborting.\n", type); return -1; } } mcl_init(workers, 0x0); A = (double *)malloc(size * size * sizeof(double)); B = (double *)malloc(size * size * sizeof(double)); C = (double *)malloc(size * size * sizeof(double)); C_test = (double *)malloc(size * size * sizeof(double)); if (!A || !B || !C || !C_test) { printf("Error allocating vectors. Aborting."); goto err; } srand48(13579862); for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { A[i * size + j] = (double)(0.5 + drand48() * 1.5); } } for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { B[i * size + j] = (double)(0.5 + drand48() * 1.5); } } for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { C[i * size + j] = 0.0; C_test[i * size + j] = 0.0; } } #ifdef VERBOSE print_matrix(A, size); printf("\n"); print_matrix(B, size); #endif if (verify) { printf("Sequential Test..."); clock_gettime(CLOCK_MONOTONIC, &start); gemm_seq(A, B, C_test, size); clock_gettime(CLOCK_MONOTONIC, &end); printf("Done.\n Test time: %f seconds\n", ((float)tdiff(end, start)) / BILLION); #ifdef VERBOSE print_matrix(C_test, size); #endif printf("\n"); } ret = test_mcl(A, B, C, size); if (verify && !ret) { #ifdef VERBOSE print_matrix(C, size); #endif mcl_finit(); mcl_verify(ret); free(A); free(B); free(C); free(C_test); } err: return ret; }