/* * Copyright (c) 2021-2022 Intel Corporation. All rights reserved. * * This software is available to you under the BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * memcopy-xe.c * * This is a memory copy bandwidth test with buffers allocated via oneAPI L0 * functions. * * Copy using memcpy() between buffers allocated with malloc(): * * ./xe_memcopy -c memcpy M M * * Copy using memcpy between buffers allocated with zeMemAllocHost(): * * ./xe_memcopy -c memcpy H H * * Copy using GPU between buffers allocated with zeMemAllocDevice(): * * ./xe_memcopy -c cmdq D D * * Copy using GPU from buffer allocated with zeMemAllocDevice() to buffer * allcated with zeMemAllocHost(): * * ./xe_memcopy -c cmdq D H * * Copy using GPU between buffers allocated with zeMemAllocDevice(), with * cached command list: * * ./xe_memcopy -c cmdq -C D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), with * immediate command list: * * ./xe_memcopy -c cmdq -i D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), using * specified device (src=dst=0): * * ./xe_memcopy -c cmdq -d 0 D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), using * specified device (src=0, dst=1): * * ./xe_memcopy -c cmdq -d 0 -D 1 D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), using * specified command queue group: * * ./xe_memcopy -c cmdq -G 2 D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), using * specified devices, command queue group and engine index: * * ./xe_memcopy -c cmdq -d 0 -D 1 -G 2 -I 1 D D * * Copy using GPU between buffers allocated with zeMemAllocDevice(), using * specified devices, 2nd device's command queue group and engine index: * * ./xe_memcopy -c cmdq -d 0 -D 1 -r -G 2 -I 1 D D * * Copy using mmap + memcpy between buffers allocated with zeMemAllocDevice(), * mmap() is called before each memcpy(): * * ./xe_memcopy -c mmap D D * * Copy using mmap + memcpy between buffers allocated with zeMemAllocDevice(), * mmap() is called once (cached): * * ./xe_memcopy -c mmap -C D D * * For more options: * * ./xe_memcopy -h */ #include #include #include #include #include #include #include #include #include #include "util.h" #include "xe.h" #define MALLOC 0 #define HOST 1 #define DEVICE 2 #define SHARED 3 struct my_buf { int where; void *buf; size_t size; int fd; void *map; size_t map_size; }; const size_t gpu_page_size = 65536; static int iterations = 1000; #define CMDQ 0 #define MEMCPY 1 #define MMAP 2 static ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t drv, void *ptr, size_t len); static ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t drv, void *ptr); static int copy_method = CMDQ; static int use_imm_cmdl = 0; static int cache_cmdl = 0; static int cache_mmap = 0; static int mmap_all = 1; static int reverse = 0; static int import = 0; static unsigned int card_num = 0; static unsigned int card_num2 = 0xFF; static unsigned int ordinal = 0; static unsigned int engine_index = 0; #define MAX_MSG_SIZE (64*1024*1024) static jmp_buf env; void segv_handler(int signum) { printf("Segmentation fault caught while accessing device buffer\n"); longjmp(env, 1); } static struct { ze_driver_handle_t drv; ze_context_handle_t ctxt; ze_device_handle_t dev; ze_device_handle_t dev2; ze_command_queue_handle_t cmdq; ze_command_list_handle_t cmdl; } gpu; void show_device_properties(ze_device_properties_t *props) { printf("vendor_id: 0x%x, device_id: 0x%x, name: %s, type: 0x%x, flags: 0x%x\n", props->vendorId, props->deviceId, props->name, props->type, props->flags); printf("\tsubdevice_id: 0x%x, core_clock: %d, max_mem_alloc: %ld, " "max_hw_ctxts: %d, threads_per_EU: %d, slices: %d\n", props->subdeviceId, props->coreClockRate, props->maxMemAllocSize, props->maxHardwareContexts, props->numThreadsPerEU, props->numSlices); } void show_cmdq_group_info(ze_device_handle_t dev) { ze_command_queue_group_properties_t *props; uint32_t cnt = 0; int i; EXIT_ON_ERROR(libze_ops.zeDeviceGetCommandQueueGroupProperties(dev, &cnt, NULL)); props = calloc(cnt, sizeof(ze_command_queue_group_properties_t)); if (!props) { printf("Error: cannot allocate memory for cmdq_group_properties\n"); exit(-1); } EXIT_ON_ERROR(libze_ops.zeDeviceGetCommandQueueGroupProperties(dev, &cnt, props)); printf("\tcommand queue groups: "); for(i = 0; i < cnt; ++i) printf("%d:[%s%s]x%d%s", i, (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) ? "comp," : "", (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) ? "copy" : "", props[i].numQueues, i < cnt -1 ? ", " : "\n"); } void init_gpu(void) { uint32_t count; ze_device_handle_t *all_devices; ze_device_properties_t device_properties; ze_context_desc_t ctxt_desc = {}; ze_command_queue_desc_t cq_desc = { .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, .ordinal = ordinal, .index = engine_index, .flags = 0, .mode = use_imm_cmdl ? ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS : ZE_COMMAND_QUEUE_MODE_DEFAULT, .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, }; ze_command_list_desc_t cl_desc = { .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, .commandQueueGroupOrdinal = ordinal, .flags = 0, }; ze_bool_t succ; ze_device_handle_t cmdq_dev; EXIT_ON_ERROR(init_libze_ops()); EXIT_ON_ERROR(libze_ops.zeInit(ZE_INIT_FLAG_GPU_ONLY)); /* get the first driver */ count = 1; EXIT_ON_ERROR(libze_ops.zeDriverGet(&count, &gpu.drv)); if (import) { EXIT_ON_ERROR(libze_ops.zeDriverGetExtensionFunctionAddress( gpu.drv, "zexDriverImportExternalPointer", (void *)&zexDriverImportExternalPointer)); EXIT_ON_ERROR(libze_ops.zeDriverGetExtensionFunctionAddress( gpu.drv, "zexDriverReleaseImportedPointer", (void *)&zexDriverReleaseImportedPointer)); } /* get the number of GPU devices */ count = 0; EXIT_ON_ERROR(libze_ops.zeDeviceGet(gpu.drv, &count, NULL)); printf("Total number of devices: %d\n", count); if (card_num >= count || card_num2 >= count) { printf("Error: card number exeeds available devices (%d)\n", count); exit(-1); } /* get GPU devices */ all_devices = calloc(count, sizeof(*all_devices)); if (!all_devices) { printf("Error: cannot allocate memory for all_devices\n"); exit(-1); } EXIT_ON_ERROR(libze_ops.zeDeviceGet(gpu.drv, &count, all_devices)); gpu.dev = all_devices[card_num]; gpu.dev2 = all_devices[card_num2]; printf("Use device %d: %p, ", card_num, gpu.dev); libze_ops.zeDeviceGetProperties(gpu.dev, &device_properties); show_device_properties(&device_properties); show_cmdq_group_info(gpu.dev); if (card_num2 != card_num) { printf("Use device %d: %p, ", card_num2, gpu.dev2); libze_ops.zeDeviceGetProperties(gpu.dev2, &device_properties); show_device_properties(&device_properties); show_cmdq_group_info(gpu.dev2); } /* check peer access capability */ EXIT_ON_ERROR(libze_ops.zeDeviceCanAccessPeer(gpu.dev, gpu.dev2, &succ)); printf("Peer access from device %d to device %d is %s\n", card_num, card_num2, succ ? "supported" : "unsupported"); /* create a context */ EXIT_ON_ERROR(libze_ops.zeContextCreate(gpu.drv, &ctxt_desc, &gpu.ctxt)); /* create command queue and commad list */ cmdq_dev = reverse ? gpu.dev2 : gpu.dev; if (use_imm_cmdl) { EXIT_ON_ERROR(libze_ops.zeCommandListCreateImmediate(gpu.ctxt, cmdq_dev, &cq_desc, &gpu.cmdl)); } else { EXIT_ON_ERROR(libze_ops.zeCommandQueueCreate(gpu.ctxt, cmdq_dev, &cq_desc, &gpu.cmdq)); EXIT_ON_ERROR(libze_ops.zeCommandListCreate(gpu.ctxt, cmdq_dev, &cl_desc, &gpu.cmdl)); } } void finalize_gpu(void) { EXIT_ON_ERROR(libze_ops.zeCommandListDestroy(gpu.cmdl)); if (!use_imm_cmdl) EXIT_ON_ERROR(libze_ops.zeCommandQueueDestroy(gpu.cmdq)); } int get_buf_fd(void *buf) { ze_ipc_mem_handle_t ipc; memset(&ipc, 0, sizeof(ipc)); EXIT_ON_ERROR((libze_ops.zeMemGetIpcHandle(gpu.ctxt, buf, &ipc))); return (int)*(uint64_t *)&ipc; } void alloc_buffer(struct my_buf *buf, size_t size, ze_device_handle_t dev) { ze_device_mem_alloc_desc_t dev_desc = {}; ze_host_mem_alloc_desc_t host_desc = {}; switch (buf->where) { case MALLOC: EXIT_ON_ERROR(posix_memalign(&buf->buf, 4096, size)); if (import) EXIT_ON_ERROR((*zexDriverImportExternalPointer)(gpu.drv, buf->buf, size)); break; case HOST: EXIT_ON_ERROR(libze_ops.zeMemAllocHost(gpu.ctxt, &host_desc, size, 4096, &buf->buf)); break; case DEVICE: EXIT_ON_ERROR(libze_ops.zeMemAllocDevice(gpu.ctxt, &dev_desc, size, 4096, dev, &buf->buf)); break; default: EXIT_ON_ERROR(libze_ops.zeMemAllocShared(gpu.ctxt, &dev_desc, &host_desc, size, 4096, dev, &buf->buf)); break; } buf->size = size; buf->fd = -1; buf->map = NULL; buf->map_size = (size + 4095) & ~4095UL; if (buf->where == MALLOC) return; buf->fd = get_buf_fd(buf->buf); if (cache_mmap && copy_method == MMAP && (buf->where == DEVICE || mmap_all)) { buf->map = mmap(NULL, buf->map_size, PROT_READ | PROT_WRITE, MAP_SHARED, buf->fd, 0); if (buf->map == MAP_FAILED) { perror("mmap"); exit(-1); } } } void free_buffer(struct my_buf *buf) { if (buf->map) munmap(buf->map, buf->map_size); if (buf->fd != -1) close(buf->fd); if (buf->where == MALLOC && import) EXIT_ON_ERROR((*zexDriverReleaseImportedPointer)(gpu.drv, buf->buf)); if (buf->where == MALLOC) free(buf->buf); else EXIT_ON_ERROR(libze_ops.zeMemFree(gpu.ctxt, buf->buf)); } void *get_buf_ptr(struct my_buf *buf, size_t size) { if (buf->map) return buf->map; if (copy_method == MMAP && buf->where != MALLOC) { buf->map = mmap(NULL, buf->map_size, PROT_READ | PROT_WRITE, MAP_SHARED, buf->fd, 0); if (buf->map == MAP_FAILED) { perror("mmap"); exit(-1); } return buf->map; } return buf->buf; } void put_buf_ptr(struct my_buf *buf) { if (!cache_mmap && copy_method == MMAP && buf->where != MALLOC) { munmap(buf->map, buf->map_size); buf->map = NULL; } } void copy_buffer(struct my_buf *src_buf, struct my_buf *dst_buf, size_t size) { if (copy_method == MEMCPY) { memcpy(dst_buf->buf, src_buf->buf, size); } else if (copy_method == MMAP) { memcpy(get_buf_ptr(dst_buf, size), get_buf_ptr(src_buf, size), size); put_buf_ptr(src_buf); put_buf_ptr(dst_buf); } else if (copy_method == CMDQ) { if (use_imm_cmdl) { EXIT_ON_ERROR(libze_ops.zeCommandListAppendMemoryCopy( gpu.cmdl, dst_buf->buf, src_buf->buf, size, NULL, 0, NULL)); EXIT_ON_ERROR(libze_ops.zeCommandListReset(gpu.cmdl)); } else { if (!cache_cmdl) { EXIT_ON_ERROR(libze_ops.zeCommandListAppendMemoryCopy( gpu.cmdl, dst_buf->buf, src_buf->buf, size, NULL, 0, NULL)); EXIT_ON_ERROR(libze_ops.zeCommandListClose(gpu.cmdl)); } EXIT_ON_ERROR(libze_ops.zeCommandQueueExecuteCommandLists( gpu.cmdq, 1, &gpu.cmdl, NULL)); EXIT_ON_ERROR(libze_ops.zeCommandQueueSynchronize( gpu.cmdq, UINT32_MAX)); if (!cache_cmdl) EXIT_ON_ERROR(libze_ops.zeCommandListReset(gpu.cmdl)); } } } void fill_buffer(struct my_buf *buf, char c, size_t size) { void *mapped; if (buf->where != DEVICE) { memset(buf->buf, c, size); } else { mapped = mmap(NULL, buf->map_size, PROT_READ | PROT_WRITE, MAP_SHARED, buf->fd, 0); if (mapped == MAP_FAILED) { perror("mmap"); exit(-1); } memset(mapped, c, size); munmap(mapped, buf->map_size); } } void check_buffer(struct my_buf *buf, char c, size_t size) { char *p; int i; size_t errors = 0; if (buf->where != DEVICE) { p = buf->buf; for (i = 0; i< size; i++) if (p[i] != c) errors++; } else { p = mmap(NULL, buf->map_size, PROT_READ | PROT_WRITE, MAP_SHARED, buf->fd, 0); if (p == MAP_FAILED) { perror("mmap"); exit(-1); } for (i = 0; i< size; i++) if (p[i] != c) errors++; munmap(p, buf->map_size); } printf("%zd errors found\n", errors); } int str_to_location(char *s) { if (!strncasecmp(s, "m", 1)) return MALLOC; else if (!strncasecmp(s, "h", 1)) return HOST; else if (!strncasecmp(s, "d", 1)) return DEVICE; else return SHARED; } char *location_to_str(int location) { if (location == MALLOC) return "Host Memory (malloc)"; else if (location == HOST) return "Host Memory (ze)"; else if (location == DEVICE) return "Device Memory"; else return "Shared Memory"; } void run_test(struct my_buf *src_buf, struct my_buf *dst_buf) { size_t size; int i; double t1, t2; signal(SIGSEGV, segv_handler); if (setjmp(env)) return; fill_buffer(src_buf, 'a', MAX_MSG_SIZE); for (size = 1; size <= MAX_MSG_SIZE; size <<= 1) { if (cache_cmdl && copy_method == CMDQ && !use_imm_cmdl) { EXIT_ON_ERROR(libze_ops.zeCommandListAppendMemoryCopy( gpu.cmdl, dst_buf->buf, src_buf->buf, size, NULL, 0, NULL)); EXIT_ON_ERROR(libze_ops.zeCommandListClose(gpu.cmdl)); } t1 = when(); for (i = 0; i < iterations; i++) copy_buffer(src_buf, dst_buf, size); t2 = when(); if (cache_cmdl && copy_method == CMDQ && !use_imm_cmdl) EXIT_ON_ERROR(libze_ops.zeCommandListReset(gpu.cmdl)); printf("%8ld (x%d):%12.2lfus%12.2lfMB/s\n", size, iterations, t2 - t1, size * iterations / ((t2-t1))); } printf("Verifying data ......\n"); check_buffer(dst_buf, 'a', MAX_MSG_SIZE); } void usage(char *prog_name) { printf("Usage: %s [] \n", prog_name); printf("Options:\n"); printf("\t-n number of iterations to perform copy operation for each message size\n"); printf("\t-c method used for copy operations, can be 'cmdq' (default), 'memcpy', 'mmap', and 'mmap-device'\n"); printf("\t-C cache the command list, or cache mmap\n"); printf("\t-i use immediate command list\n"); printf("\t-d device to use (default: 0)\n"); printf("\t-D second device to use (default: the same as the first device)\n"); printf("\t-G command queue group ordinal (default: 0)\n"); printf("\t-I engine index within the command queue group (default: 0)\n"); printf("\t-r reverse the direction by creating command queue on the device with the destination buffer\n"); printf("\t-M import malloc'ed buffer into L0 before the copy\n"); printf("\t location of source buffer -- 'M':malloc, 'H':host, 'D':device, 'S':shared\n"); printf("\t location of destination buffer -- 'M':malloc, 'H':host, 'D':device, 'S':shared\n"); exit(1); } int main(int argc, char **argv) { struct my_buf src_buf = {}; struct my_buf dst_buf = {}; int c; while ((c = getopt(argc, argv, "n:c:Cid:D:G:I:rMh")) != -1) { switch (c) { case 'n': iterations = atoi(optarg); break; case 'c': if (!strcasecmp(optarg, "cmdq")) copy_method = CMDQ; else if (!strcasecmp(optarg, "memcpy")) copy_method = MEMCPY; else if (!strcasecmp(optarg, "mmap")) copy_method = MMAP, mmap_all = 1; else if (!strcasecmp(optarg, "mmap-device")) copy_method = MMAP, mmap_all = 0; else printf("Invalid copy method '%s', using default (cmdq).\n", optarg); break; case 'C': cache_cmdl = cache_mmap = 1; break; case 'i': use_imm_cmdl = 1; break; case 'd': card_num = atoi(optarg); break; case 'D': card_num2 = atoi(optarg); break; case 'G': ordinal = atoi(optarg); break; case 'I': engine_index = atoi(optarg); break; case 'r': reverse = 1; break; case 'M': import = 1; break; default: usage(argv[0]); break; } } if (argc < optind + 2) usage(argv[0]); src_buf.where = str_to_location(argv[optind]); dst_buf.where = str_to_location(argv[optind + 1]); if (card_num2 == 0xFF || src_buf.where == HOST || dst_buf.where == HOST) card_num2 = card_num; init_gpu(); alloc_buffer(&src_buf, MAX_MSG_SIZE, gpu.dev); alloc_buffer(&dst_buf, MAX_MSG_SIZE, gpu.dev2); printf("Copy from %s to %s, ", location_to_str(src_buf.where), location_to_str(dst_buf.where)); if (copy_method == MEMCPY) printf("using memcpy\n"); else if (copy_method == MMAP) printf("using mmap (%s) on %s\n", cache_mmap ? "cached" : "non-cached", mmap_all ? "all memory except malloc" : "device memory"); else printf("using %s command list (%s)\n", use_imm_cmdl ? "immediate" : "regular", cache_cmdl ? "cached" : "non-cached"); printf("Import external pointers: %s\n", import ? "yes" : "no"); run_test(&src_buf, &dst_buf); free_buffer(&src_buf); free_buffer(&dst_buf); finalize_gpu(); return 0; }