// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
//
// NOTE(review): the header names after each #include were missing from the
// reviewed text. The headers below are the ones required by identifiers this
// file uses (exp/log, printf, malloc/free/exit, clock_gettime,
// omp_get_max_threads, pthread_*). The Fortran-style BLAS symbols referenced
// by the test table (ztrmv_, dgemv_, zgemv_, dger_, zgerc_) also need
// declarations -- confirm which project header supplied them and restore it.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#ifndef _WIN32
#include <pthread.h>
#endif

#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10

// Number of loop iterations for a 1x1 matrix. Lower it if the test is
// too slow on your computer.
#define NLOOP 2e7

// Description of one benchmark run.
//   matrix_size   order of the square matrix handed to the BLAS routine
//   n_loop        iteration budget; each benchmark divides it by matrix_size
//   bench_func    benchmark driver, called with a pointer to this structure
//   blas_func     BLAS routine under test (Fortran calling convention:
//                 every argument is passed by address)
//   create_matrix allocator returning `size` initialised elements; the
//                 caller releases the buffer with free()
typedef struct {
    int matrix_size;
    int n_loop;
    void (* bench_func)();
    void (* blas_func)();
    void * (* create_matrix)(int size);
} BenchParam;

// Allocate `count` elements of `elem_size` bytes each, aborting the
// benchmark if the allocation fails. A zero-element request may return NULL.
static void * alloc_or_die(size_t count, size_t elem_size) {
    void * p = malloc(count * elem_size);
    if (p == NULL && count != 0) {
        fprintf(stderr, "out of memory allocating %zu elements\n", count);
        exit(EXIT_FAILURE);
    }
    return p;
}

// Return `count` floats where element i holds 1e3 * i / denom.
static float * new_float_ramp(int count, int denom) {
    float * r = alloc_or_die((size_t)count, sizeof *r);
    for (int i = 0; i < count; i++)
        r[i] = (float)(1e3 * i / denom);
    return r;
}

// Return `count` doubles where element i holds 1e3 * i / denom.
static double * new_double_ramp(int count, int denom) {
    double * r = alloc_or_die((size_t)count, sizeof *r);
    for (int i = 0; i < count; i++)
        r[i] = 1e3 * i / denom;
    return r;
}

// Single-precision real buffer of `size` elements.
void * s_create_matrix(int size) {
    return new_float_ramp(size, size);
}

// Single-precision complex buffer: `size` elements stored as 2 * size floats.
void * c_create_matrix(int size) {
    return new_float_ramp(2 * size, size);
}

// Double-precision complex buffer: `size` elements stored as 2 * size doubles.
void * z_create_matrix(int size) {
    return new_double_ramp(2 * size, size);
}

// Double-precision real buffer of `size` elements.
void * d_create_matrix(int size) {
    return new_double_ramp(size, size);
}

// Benchmark a *trmv-style routine: call blas_func n_loop / matrix_size times
// on a matrix_size x matrix_size matrix and a vector of matrix_size elements.
void trmv_bench(BenchParam * param)
{
    int size = param->matrix_size;
    int n = param->n_loop / size;
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for (int i = 0; i < n; i++) {
        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
    }
    free(A);
    free(y);
}

// Benchmark a *gemv-style routine (y := alpha*A*x + beta*y), calling
// blas_func n_loop / matrix_size times.
void gemv_bench(BenchParam * param)
{
    int size = param->matrix_size;
    int n = param->n_loop / size;
    // Two doubles so the same scalar is valid for both the real routine
    // (reads v[0]) and the complex one (reads real and imaginary parts).
    double v[2] = { 1.01, 0.0 };
    int one = 1;
    void * A = param->create_matrix(size * size);
    // x is read-only and y is updated in place, so they get separate buffers.
    void * x = param->create_matrix(size);
    void * y = param->create_matrix(size);
    for (int i = 0; i < n; i++) {
        param->blas_func("N", &size, &size, v, A, &size, x, &one, v, y, &one);
    }
    free(A);
    free(x);
    free(y);
}

// Benchmark a *ger-style rank-1 update (A := alpha*x*y' + A), calling
// blas_func n_loop / matrix_size times. The same vector is used for x and y;
// both are inputs only.
void ger_bench(BenchParam * param)
{
    int size = param->matrix_size;
    int n = param->n_loop / size;
    // Two doubles: valid scalar for both the real and the complex routine.
    double v[2] = { 1.01, 0.0 };
    int one = 1;
    void * A = param->create_matrix(size * size);
    void * y = param->create_matrix(size);
    for (int i = 0; i < n; i++) {
        param->blas_func(&size, &size, v, y, &one, y, &one, A, &size);
    }
    free(A);
    free(y);
}
#ifndef _WIN32 void * pthread_func_wrapper(void * param) { ((BenchParam *)param)->bench_func(param); pthread_exit(NULL); } #endif #define NB_TESTS 5 void * TESTS[4 * NB_TESTS] = { trmv_bench, ztrmv_, z_create_matrix, "ztrmv", gemv_bench, dgemv_, d_create_matrix, "dgemv", gemv_bench, zgemv_, z_create_matrix, "zgemv", ger_bench, dger_, d_create_matrix, "dger", ger_bench, zgerc_, z_create_matrix, "zgerc", }; inline static double delta_time(struct timespec tick) { struct timespec tock; clock_gettime(CLOCK_MONOTONIC, &tock); return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9; } double pthread_bench(BenchParam * param, int nb_threads) { #ifdef _WIN32 return 0; #else BenchParam threaded_param = *param; pthread_t threads[nb_threads]; int t, rc; struct timespec tick; threaded_param.n_loop /= nb_threads; clock_gettime(CLOCK_MONOTONIC, &tick); for(t=0; tbench_func(param); return delta_time(tick); } double omp_bench(BenchParam * param) { BenchParam threaded_param = *param; struct timespec tick; int t; int nb_threads = omp_get_max_threads(); threaded_param.n_loop /= nb_threads; clock_gettime(CLOCK_MONOTONIC, &tick); #pragma omp parallel for for(t = 0; t < nb_threads; t ++){ param->bench_func(&threaded_param); } return delta_time(tick); } int main(int argc, char * argv[]) { double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE); BenchParam param; int test_id; printf ("Running on %d threads\n", omp_get_max_threads()); for(test_id = 0; test_id < NB_TESTS; test_id ++) { double size = MIN_SIZE; param.bench_func = TESTS[test_id * 4]; param.blas_func = TESTS[test_id * 4 + 1]; param.create_matrix = TESTS[test_id * 4 + 2]; printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]); param.n_loop = NLOOP; while(size <= MAX_SIZE) { param.matrix_size = (int)(size + 0.5); double seq_time = seq_bench(¶m); double omp_time = omp_bench(¶m); double pthread_time = pthread_bench(¶m, omp_get_max_threads()); printf("matrix size %d, sequential %gs, openmp %gs, 
speedup %g, " "pthread %gs, speedup %g\n", param.matrix_size, seq_time, omp_time, seq_time / omp_time, pthread_time, seq_time / pthread_time); size *= inc_factor; } } return(0); }