From 5b795bc1e11e2c9f22f17974d971be084d96096c Mon Sep 17 00:00:00 2001 From: John Tramm Date: Wed, 18 Nov 2020 16:06:56 +0000 Subject: [PATCH] added ability for user to select OpenCL platform and device IDs. --- opencl/CLutils.c | 196 ---------------- opencl/Makefile | 8 +- opencl/cl_utils.c | 352 ++++++++++++++++++++++++++++ opencl/cl_utils.h | 19 ++ opencl/io.c | 499 +++++++++++++++++++++------------------- opencl/kernel.cl | 2 + opencl/main.c | 3 - opencl/rsbench.h | 6 +- opencl/simulation.c | 396 ++++++++++++++++--------------- openmp-offload/Makefile | 12 +- 10 files changed, 851 insertions(+), 642 deletions(-) delete mode 100644 opencl/CLutils.c create mode 100644 opencl/cl_utils.c create mode 100644 opencl/cl_utils.h diff --git a/opencl/CLutils.c b/opencl/CLutils.c deleted file mode 100644 index a02da6e..0000000 --- a/opencl/CLutils.c +++ /dev/null @@ -1,196 +0,0 @@ -#include "rsbench.h" - -const char *getErrorString(cl_int error) -{ -switch(error){ - // run-time and JIT compiler errors - case 0: return "CL_SUCCESS"; - case -1: return "CL_DEVICE_NOT_FOUND"; - case -2: return "CL_DEVICE_NOT_AVAILABLE"; - case -3: return "CL_COMPILER_NOT_AVAILABLE"; - case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case -5: return "CL_OUT_OF_RESOURCES"; - case -6: return "CL_OUT_OF_HOST_MEMORY"; - case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case -8: return "CL_MEM_COPY_OVERLAP"; - case -9: return "CL_IMAGE_FORMAT_MISMATCH"; - case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case -11: return "CL_BUILD_PROGRAM_FAILURE"; - case -12: return "CL_MAP_FAILURE"; - case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - case -15: return "CL_COMPILE_PROGRAM_FAILURE"; - case -16: return "CL_LINKER_NOT_AVAILABLE"; - case -17: return "CL_LINK_PROGRAM_FAILURE"; - case -18: return "CL_DEVICE_PARTITION_FAILED"; - case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - - // compile-time errors - case -30: return "CL_INVALID_VALUE"; - case -31: return "CL_INVALID_DEVICE_TYPE"; - case -32: return "CL_INVALID_PLATFORM"; - case -33: return "CL_INVALID_DEVICE"; - case -34: return "CL_INVALID_CONTEXT"; - case -35: return "CL_INVALID_QUEUE_PROPERTIES"; - case -36: return "CL_INVALID_COMMAND_QUEUE"; - case -37: return "CL_INVALID_HOST_PTR"; - case -38: return "CL_INVALID_MEM_OBJECT"; - case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case -40: return "CL_INVALID_IMAGE_SIZE"; - case -41: return "CL_INVALID_SAMPLER"; - case -42: return "CL_INVALID_BINARY"; - case -43: return "CL_INVALID_BUILD_OPTIONS"; - case -44: return "CL_INVALID_PROGRAM"; - case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; - case -46: return "CL_INVALID_KERNEL_NAME"; - case -47: return "CL_INVALID_KERNEL_DEFINITION"; - case -48: return "CL_INVALID_KERNEL"; - case -49: return "CL_INVALID_ARG_INDEX"; - case -50: return "CL_INVALID_ARG_VALUE"; - case -51: return "CL_INVALID_ARG_SIZE"; - case -52: return "CL_INVALID_KERNEL_ARGS"; - case -53: return "CL_INVALID_WORK_DIMENSION"; - case -54: return "CL_INVALID_WORK_GROUP_SIZE"; - case -55: return "CL_INVALID_WORK_ITEM_SIZE"; - case -56: return "CL_INVALID_GLOBAL_OFFSET"; - case -57: return "CL_INVALID_EVENT_WAIT_LIST"; - case -58: return "CL_INVALID_EVENT"; - case -59: return "CL_INVALID_OPERATION"; - case -60: return "CL_INVALID_GL_OBJECT"; - case -61: return "CL_INVALID_BUFFER_SIZE"; - case -62: return "CL_INVALID_MIP_LEVEL"; - case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; - case -64: return "CL_INVALID_PROPERTY"; - case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; - case -66: return "CL_INVALID_COMPILER_OPTIONS"; - case -67: return "CL_INVALID_LINKER_OPTIONS"; - case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - - // extension errors - case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; - case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; - case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; - case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; - case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; - case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; - default: return "Unknown OpenCL error"; - } -} - -void check(cl_int error) -{ - if( error != 0 ) - printf("%s\n", getErrorString(error)); -} - -void printCompilerError( cl_program program, cl_device_id device ) -{ - cl_int status; - - size_t logSize; - char * log; - - status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); - - check( status ); - - log = (char *) malloc(logSize); - if( !log) { - exit(-1); - } - - status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); - check(status); - - printf("%s\n", log); -} - -void print_single_info( cl_platform_id platform, cl_device_id device) -{ - char* value; - size_t valueSize; - cl_uint maxComputeUnits; - // print device name - clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL); - printf("Device: %s\n", value); - free(value); - // print hardware device version - clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(device, CL_DEVICE_VERSION, valueSize, value, NULL); - printf(" %d Hardware version: %s\n", 1, value); - free(value); - // print software driver version - clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(device, CL_DRIVER_VERSION, valueSize, value, NULL); - printf(" %d Software version: %s\n", 2, value); - free(value); - // print c version supported by compiler for device - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL); - printf(" %d OpenCL C version: %s\n", 3, value); - free(value); - // print parallel compute units - clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(maxComputeUnits), &maxComputeUnits, NULL); - printf(" %d Parallel compute units: %d\n", 4, maxComputeUnits); -} - -void print_opencl_info(void) -{ - int i, j; - char* value; - size_t valueSize; - cl_uint platformCount; - cl_platform_id* platforms; - cl_uint deviceCount; - cl_device_id* devices; - cl_uint maxComputeUnits; - // get all platforms - clGetPlatformIDs(0, NULL, &platformCount); - platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); - clGetPlatformIDs(platformCount, platforms, NULL); - for (i = 0; i < platformCount; i++) { - // get all devices - clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); - devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount); - clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL); - // for each device print critical attributes - for (j = 0; j < deviceCount; j++) { - // print device name - clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL); - printf("%d. Device: %s\n", j+1, value); - free(value); - // print hardware device version - clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL); - printf(" %d.%d Hardware version: %s\n", j+1, 1, value); - free(value); - // print software driver version - clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL); - printf(" %d.%d Software version: %s\n", j+1, 2, value); - free(value); - // print c version supported by compiler for device - clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); - value = (char*) malloc(valueSize); - clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL); - printf(" %d.%d OpenCL C version: %s\n", j+1, 3, value); - free(value); - // print parallel compute units - clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(maxComputeUnits), &maxComputeUnits, NULL); - printf(" %d.%d Parallel compute units: %d\n", j+1, 4, maxComputeUnits); - } - free(devices); - } - free(platforms); -} diff --git a/opencl/Makefile b/opencl/Makefile index 98162c7..59f212a 100644 --- a/opencl/Makefile +++ b/opencl/Makefile @@ -20,7 +20,7 @@ io.c \ init.c \ material.c \ utils.c\ -CLutils.c +cl_utils.c obj = $(source:.c=.o) @@ -61,17 +61,17 @@ endif # Targets to Build #=============================================================================== -$(program): $(obj) rsbench.h Makefile +$(program): $(obj) rsbench.h cl_utils.h Makefile $(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS) -%.o: %.c rsbench.h Makefile +%.o: %.c rsbench.h cl_utils.h Makefile $(CC) $(CFLAGS) -c $< -o $@ clean: rm -rf rsbench $(obj) edit: - vim -p $(source) kernel.cl rsbench.h + vim -p $(source) kernel.cl rsbench.h cl_utils.h run: ./rsbench diff --git a/opencl/cl_utils.c b/opencl/cl_utils.c new file mode 100644 index 0000000..f482fb0 --- /dev/null +++ b/opencl/cl_utils.c @@ -0,0 +1,352 @@ +#include "rsbench.h" + +const char *getErrorString(cl_int error) +{ + switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } +} + +void check(cl_int error) +{ + if( error != 0 ) + printf("%s\n", getErrorString(error)); +} + +void printCompilerError( cl_program program, cl_device_id device ) +{ + cl_int status; + + size_t logSize; + char * log; + + status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); + + check( status ); + + log = (char *) malloc(logSize); + if( !log) { + exit(-1); + } + + status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); + check(status); + + if( strlen(log) > 0 ) + printf("%s\n", log); +} + +void print_single_info( cl_platform_id platform, cl_device_id device) +{ + char* value; + size_t valueSize; + cl_uint maxComputeUnits; + // print device name + clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL); + printf("Device: %s\n", value); + free(value); + // print hardware device version + clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(device, CL_DEVICE_VERSION, valueSize, value, NULL); + printf(" %d Hardware version: %s\n", 1, value); + free(value); + // print software driver version + clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(device, CL_DRIVER_VERSION, valueSize, value, NULL); + printf(" %d Software version: %s\n", 2, value); + free(value); + // print c version supported by compiler for device + clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL); + printf(" %d OpenCL C version: %s\n", 3, value); + free(value); + // print parallel compute units + clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(maxComputeUnits), &maxComputeUnits, NULL); + printf(" %d Parallel compute units: %d\n", 4, maxComputeUnits); +} + +void print_opencl_info(void) +{ + int i, j; + char* value; + size_t valueSize; + cl_uint platformCount; + cl_platform_id* platforms; + cl_uint deviceCount; + cl_device_id* devices; + cl_uint maxComputeUnits; + // get all platforms + clGetPlatformIDs(0, NULL, &platformCount); + printf("Number of platforms detected = %d\n", platformCount); + platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); + clGetPlatformIDs(platformCount, platforms, NULL); + for (i = 0; i < platformCount; i++) { + printf("Platorm %d:\n", i); + // get all devices + clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); + devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount); + clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL); + // for each device print critical attributes + for (j = 0; j < deviceCount; j++) { + // print device name + clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL); + printf("\tDevice %d: %s\n", j+1, value); + free(value); + // print hardware device version + clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL); + printf("\t\t%d.%d Hardware version: %s\n", j+1, 1, value); + free(value); + // print software driver version + clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL); + printf("\t\t%d.%d Software version: %s\n", j+1, 2, value); + free(value); + // print c version supported by compiler for device + clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); + value = (char*) malloc(valueSize); + clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL); + printf("\t\t%d.%d OpenCL C version: %s\n", j+1, 3, value); + free(value); + // print parallel compute units + clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(maxComputeUnits), &maxComputeUnits, NULL); + printf("\t\t%d.%d Parallel compute units: %d\n", j+1, 4, maxComputeUnits); + } + free(devices); + } + free(platforms); +} + +OpenCLInfo initialize_device(int user_platform_id, int user_device_id) +{ + border_print(); + center_print("AVAILABLE OPENCL PLATFORMS & DEVICES", 79); + border_print(); + print_opencl_info(); + border_print(); + + center_print("DEVICE INITIALIZATION", 79); + border_print(); + + // Get platform and device information + /* + cl_platform_id platform_id = NULL; + cl_device_id device_id = NULL; + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + check(ret); + ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); + check(ret); + */ + + int platform_idx = 0; + if( user_platform_id != -1 ) + platform_idx = user_platform_id; + + // Get # of Platforms + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms); + check(ret); + + // Allocate space for platform information + cl_platform_id * platform_ids = (cl_platform_id *) malloc(ret_num_platforms * sizeof(cl_platform_id)); + + // Fill in data on Platforms + ret = clGetPlatformIDs(ret_num_platforms, platform_ids, NULL); + + cl_platform_id target_platform_id = platform_ids[platform_idx]; // '1' is the target platform index. needs to be changed accordingly. + + int device_idx = CL_DEVICE_TYPE_DEFAULT; + if( user_device_id != -1 ) + device_idx = user_device_id; + + cl_uint ret_num_devices; + cl_device_id device_id = NULL; + ret = clGetDeviceIDs( target_platform_id, device_idx, 1, &device_id, &ret_num_devices); + + // Print info about where we are running + printf("Selected Device (platform id = %d, device id = %d)\n", platform_idx, device_idx); + print_single_info(target_platform_id, device_id); + printf("Note: platform ID can be specified with the \"-P\" flag and device id with the \"-D\" flag\n"); + + // Create an OpenCL context + cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); + check(ret); + + // Create a command queue + //cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); + cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); + check(ret); + + OpenCLInfo CL; + CL.platform_id = target_platform_id; + CL.device_id = device_id; + CL.context = context; + CL.command_queue = command_queue; + return CL; +} + +cl_mem copy_array_to_device(OpenCLInfo * CL, cl_mem_flags mem_flags, void * array, size_t sz) +{ + cl_int ret; + cl_mem d_array = clCreateBuffer(CL->context, mem_flags, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(CL->command_queue, d_array, CL_TRUE, 0, sz, array, 0, NULL, NULL); + check(ret); + return d_array; +} + +void copy_array_from_device(OpenCLInfo * CL, cl_mem * d_array, void * h_array, size_t sz) +{ + cl_int ret; + ret = clEnqueueReadBuffer(CL->command_queue, *d_array, CL_TRUE, 0, sz, h_array, 0, NULL, NULL); + check(ret); +} + +void set_kernel_arguments(cl_kernel * kernel, int argc, size_t * arg_sz, void ** args) +{ + cl_int ret; + for( int i = 0; i < argc; i++ ) + { + ret = clSetKernelArg(*kernel, i, arg_sz[i], args[i]); + check(ret); + } +} + +cl_kernel compile_kernel(OpenCLInfo * CL, char * kernel_name) +{ + printf("Compiling %s...\n", kernel_name); + + // Load the kernel source code into the array source_str + FILE *fp; + char *source_str; + size_t source_size; + char kernel_fname[256]; + strcpy(kernel_fname, kernel_name); + strcat(kernel_fname, ".cl"); + + fp = fopen(kernel_fname, "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = (char*) malloc(MAX_SOURCE_SIZE); + source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); + assert(source_size > 0 ); + fclose( fp ); + + // Create a program from the kernel source + cl_int ret; + cl_program program = clCreateProgramWithSource(CL->context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + check(ret); + + // Build the program + ret = clBuildProgram(program, 1, &CL->device_id, NULL, NULL, NULL); + check(ret); + + printCompilerError( program, CL->device_id ); + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, kernel_name, &ret); + check(ret); + + return kernel; +} + +void clear_array(OpenCLInfo * CL, cl_mem * buffer, size_t sz) +{ + float fill = 0.0; + cl_int ret = clEnqueueFillBuffer( + CL->command_queue, + *buffer, + (void *) &fill, + sizeof(float), + 0, + sz, + 0, + NULL, + NULL); + check(ret); +} diff --git a/opencl/cl_utils.h b/opencl/cl_utils.h new file mode 100644 index 0000000..a3402c8 --- /dev/null +++ b/opencl/cl_utils.h @@ -0,0 +1,19 @@ + +typedef struct{ + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + cl_command_queue command_queue; +} OpenCLInfo; + +const char *getErrorString(cl_int error); +void check(cl_int error); +void printCompilerError( cl_program program, cl_device_id device ); +void print_single_info( cl_platform_id platform, cl_device_id device); +void print_opencl_info(void); +OpenCLInfo initialize_device(int user_platform_id, int user_device_id); +cl_mem copy_array_to_device(OpenCLInfo * CL, cl_mem_flags mem_flags, void * array, size_t sz); +void copy_array_from_device(OpenCLInfo * CL, cl_mem * d_array, void * h_array, size_t sz); +void set_kernel_arguments(cl_kernel * kernel, int argc, size_t * arg_sz, void ** args); +cl_kernel compile_kernel(OpenCLInfo * CL, char * kernel_name); +void clear_array(OpenCLInfo * CL, cl_mem * buffer, size_t sz); diff --git a/opencl/io.c b/opencl/io.c index bb99aaa..771e562 100644 --- a/opencl/io.c +++ b/opencl/io.c @@ -87,252 +87,285 @@ Input read_CLI( int argc, char * argv[] ) input.doppler = 1; // defaults to baseline simulation kernel input.kernel_id = 0; - - int default_lookups = 1; - int default_particles = 1; - - // Collect Raw Input - for( int i = 1; i < argc; i++ ) - { - char * arg = argv[i]; - - // Simulation Method (-m) - if( strcmp(arg, "-m") == 0 ) - { - char * sim_type = NULL; - if( ++i < argc ) - sim_type = argv[i]; - else - print_CLI_error(); - - if( strcmp(sim_type, "history") == 0 ) - input.simulation_method = HISTORY_BASED; - else if( strcmp(sim_type, "event") == 0 ) - { - input.simulation_method = EVENT_BASED; - // Also resets default # of lookups - if( default_lookups && default_particles ) - { - input.lookups = input.lookups * input.particles; - input.particles = 0; - } - } - else - print_CLI_error(); - } - // lookups (-l) - else if( strcmp(arg, "-l") == 0 ) - { - if( ++i < argc ) - { - input.lookups = atoi(argv[i]); - default_lookups = 0; - } - else - print_CLI_error(); - } - // particles (-p) - else if( strcmp(arg, "-p") == 0 ) - { - if( ++i < argc ) - { - input.particles = atoi(argv[i]); - default_particles = 0; - } - else - print_CLI_error(); - } - // nuclides (-n) - else if( strcmp(arg, "-n") == 0 ) - { - if( ++i < argc ) - input.n_nuclides = atoi(argv[i]); - else - print_CLI_error(); - } - // HM (-s) - else if( strcmp(arg, "-s") == 0 ) - { - if( ++i < argc ) - { - if( strcmp(argv[i], "small") == 0 ) - input.HM = SMALL; - else if ( strcmp(argv[i], "large") == 0 ) - input.HM = LARGE; - else - print_CLI_error(); - } - else - print_CLI_error(); - } - // Doppler Broadening (Temperature Dependence) - else if( strcmp(arg, "-d") == 0 ) - { - input.doppler = 0; - } - // Avg number of windows per nuclide (-w) - else if( strcmp(arg, "-W") == 0 ) - { - if( ++i < argc ) - input.avg_n_windows = atoi(argv[i]); - else - print_CLI_error(); - } - // Avg number of poles per nuclide (-p) - else if( strcmp(arg, "-P") == 0 ) - { - if( ++i < argc ) - input.avg_n_poles = atoi(argv[i]); - else - print_CLI_error(); - } - // Kernel ID (-k) - else if( strcmp(arg, "-k") == 0 ) - { - if( ++i < argc ) - input.kernel_id = atoi(argv[i]); - else - print_CLI_error(); - } - else - print_CLI_error(); - } - // Validate Input - - // Validate nthreads - if( input.nthreads < 1 ) - print_CLI_error(); - - // Validate n_isotopes - if( input.n_nuclides < 1 ) - print_CLI_error(); - - // Validate lookups - if( input.lookups < 1 ) - print_CLI_error(); - - // Validate lookups - if( input.avg_n_poles < 1 ) - print_CLI_error(); - - // Validate lookups - if( input.avg_n_windows < 1 ) - print_CLI_error(); - - // Set HM size specific parameters - // (defaults to large) - if( input.HM == SMALL ) - input.n_nuclides = 68; - - // Return input struct - return input; + input.platform_id = -1; + input.device_id = -1; + + int default_lookups = 1; + int default_particles = 1; + + // Collect Raw Input + for( int i = 1; i < argc; i++ ) + { + char * arg = argv[i]; + + // Simulation Method (-m) + if( strcmp(arg, "-m") == 0 ) + { + char * sim_type = NULL; + if( ++i < argc ) + sim_type = argv[i]; + else + print_CLI_error(); + + if( strcmp(sim_type, "history") == 0 ) + input.simulation_method = HISTORY_BASED; + else if( strcmp(sim_type, "event") == 0 ) + { + input.simulation_method = EVENT_BASED; + // Also resets default # of lookups + if( default_lookups && default_particles ) + { + input.lookups = input.lookups * input.particles; + input.particles = 0; + } + } + else + print_CLI_error(); + } + // lookups (-l) + else if( strcmp(arg, "-l") == 0 ) + { + if( ++i < argc ) + { + input.lookups = atoi(argv[i]); + default_lookups = 0; + } + else + print_CLI_error(); + } + // particles (-p) + else if( strcmp(arg, "-p") == 0 ) + { + if( ++i < argc ) + { + input.particles = atoi(argv[i]); + default_particles = 0; + } + else + print_CLI_error(); + } + // nuclides (-n) + else if( strcmp(arg, "-n") == 0 ) + { + if( ++i < argc ) + input.n_nuclides = atoi(argv[i]); + else + print_CLI_error(); + } + // HM (-s) + else if( strcmp(arg, "-s") == 0 ) + { + if( ++i < argc ) + { + if( strcmp(argv[i], "small") == 0 ) + input.HM = SMALL; + else if ( strcmp(argv[i], "large") == 0 ) + input.HM = LARGE; + else + print_CLI_error(); + } + else + print_CLI_error(); + } + // Doppler Broadening (Temperature Dependence) + else if( strcmp(arg, "-d") == 0 ) + { + input.doppler = 0; + } + // Avg number of windows per nuclide (-w) + else if( strcmp(arg, "-W") == 0 ) + { + if( ++i < argc ) + input.avg_n_windows = atoi(argv[i]); + else + print_CLI_error(); + } + // Avg number of poles per nuclide (-p) + else if( strcmp(arg, "-p") == 0 ) + { + if( ++i < argc ) + input.avg_n_poles = atoi(argv[i]); + else + print_CLI_error(); + } + // Kernel ID (-k) + else if( strcmp(arg, "-k") == 0 ) + { + if( ++i < argc ) + input.kernel_id = atoi(argv[i]); + else + print_CLI_error(); + } + // Platform selection (-P) + else if( strcmp(arg, "-P") == 0 ) + { + if( ++i < argc ) + { + input.platform_id = atoi(argv[i]); + printf("PLATFORM ID DETECTED = %d\n", input.platform_id); + } + else + print_CLI_error(); + } + // Device selection (-D) + else if( strcmp(arg, "-D") == 0 ) + { + if( ++i < argc ) + input.device_id = atoi(argv[i]); + else + print_CLI_error(); + } + else + print_CLI_error(); + } + + // Validate Input + + // Validate nthreads + if( input.nthreads < 1 ) + print_CLI_error(); + + // Validate n_isotopes + if( input.n_nuclides < 1 ) + print_CLI_error(); + + // Validate lookups + if( input.lookups < 1 ) + print_CLI_error(); + + // Validate lookups + if( input.avg_n_poles < 1 ) + print_CLI_error(); + + // Validate lookups + if( input.avg_n_windows < 1 ) + print_CLI_error(); + + // Set HM size specific parameters + // (defaults to large) + if( input.HM == SMALL ) + input.n_nuclides = 68; + + // Return input struct + return input; } void print_CLI_error(void) { - printf("Usage: ./multibench \n"); - printf("Options include:\n"); - printf(" -s Size of H-M Benchmark to run (small, large)\n"); - printf(" -l Number of Cross-section (XS) lookups per particle history\n"); - printf(" -p Number of particle histories\n"); - printf(" -P Average Number of Poles per Nuclide\n"); - printf(" -W Average Number of Windows per Nuclide\n"); - printf(" -d Disables Temperature Dependence (Doppler Broadening)\n"); - printf("Default is equivalent to: -s large -l 34 -p 300000 -P 1000 -W 100\n"); - printf("See readme for full description of default run values\n"); - exit(4); + printf("Usage: ./multibench \n"); + printf("Options include:\n"); + printf(" -s Size of H-M Benchmark to run (small, large)\n"); + printf(" -l Number of Cross-section (XS) lookups per particle history\n"); + printf(" -p Number of particle histories\n"); + printf(" -P Average Number of Poles per Nuclide\n"); + printf(" -W Average Number of Windows per Nuclide\n"); + printf(" -d Disables Temperature Dependence (Doppler Broadening)\n"); + printf(" -P Manually specify the OpenCL platform id to run on\n"); + printf(" -D Manually specify the OpenCL device id to run on\n"); + printf("Default is equivalent to: -s large -l 34 -p 300000 -P 1000 -W 100\n"); + printf("See readme for full description of default run values\n"); + exit(4); } void print_input_summary(Input input) { - // Calculate Estimate of Memory Usage - size_t mem = get_mem_estimate(input); - - printf("Programming Model: OpenCL\n"); - if( input.simulation_method == EVENT_BASED ) - printf("Simulation Method: Event Based\n"); - else - printf("Simulation Method: History Based\n"); - printf("Materials: 12\n"); - printf("H-M Benchmark Size: "); - if( input.HM == 0 ) - printf("Small\n"); - else - printf("Large\n"); - if( input.doppler == 1 ) - printf("Temperature Dependence: ON\n"); - else - printf("Temperature Dependence: OFF\n"); - printf("Total Nuclides: %d\n", input.n_nuclides); - printf("Avg Poles per Nuclide: "); fancy_int(input.avg_n_poles); - printf("Avg Windows per Nuclide: "); fancy_int(input.avg_n_windows); - - int lookups = input.lookups; - if( input.simulation_method == HISTORY_BASED ) - { - printf("Particles: "); fancy_int(input.particles); - printf("XS Lookups per Particle: "); fancy_int(input.lookups); - lookups *= input.particles; - } - printf("Total XS Lookups: "); fancy_int(lookups); - printf("Est. Memory Usage (MB): %.1lf\n", mem / 1024.0 / 1024.0); + // Calculate Estimate of Memory Usage + size_t mem = get_mem_estimate(input); + + printf("Programming Model: OpenCL\n"); + if( input.simulation_method == EVENT_BASED ) + printf("Simulation Method: Event Based\n"); + else + printf("Simulation Method: History Based\n"); + printf("Materials: 12\n"); + printf("H-M Benchmark Size: "); + if( input.HM == 0 ) + printf("Small\n"); + else + printf("Large\n"); + if( input.doppler == 1 ) + printf("Temperature Dependence: ON\n"); + else + printf("Temperature Dependence: OFF\n"); + printf("Total Nuclides: %d\n", input.n_nuclides); + printf("Avg Poles per Nuclide: "); fancy_int(input.avg_n_poles); + printf("Avg Windows per Nuclide: "); fancy_int(input.avg_n_windows); + + int lookups = input.lookups; + if( input.simulation_method == HISTORY_BASED ) + { + printf("Particles: "); fancy_int(input.particles); + printf("XS Lookups per Particle: "); fancy_int(input.lookups); + lookups *= input.particles; + } + printf("Total XS Lookups: "); fancy_int(lookups); + printf("Est. Memory Usage (MB): %.1lf\n", mem / 1024.0 / 1024.0); + if( input.platform_id == -1 ) + printf("OpenCL Platform ID: Default\n"); + else + printf("OpenCL Platform ID: %d\n", input.platform_id); + + if( input.device_id == -1 ) + printf("OpenCL Device ID: Default\n"); + else + printf("OpenCL Device ID: %d\n", input.device_id); } int validate_and_print_results(Input input, double runtime, unsigned long vhash, double sim_runtime) { - int lookups = 0; - if( input.simulation_method == HISTORY_BASED ) - lookups = input.lookups*input.particles; - else - lookups = input.lookups; - int lookups_per_sec = (int) ((double) lookups / runtime); - int sim_only_lookups_per_sec = (int) ((double) lookups/ sim_runtime); - printf("Total Time Statistics (OpenCL Init / JIT Compilation + Simulation Kernel)\n"); - printf("Runtime: %.3lf seconds\n", runtime); - printf("Lookups: "); fancy_int(lookups); - printf("Lookups/s: "); - fancy_int(lookups_per_sec); - printf("Simulation Kernel Only Statistics\n"); - printf("Runtime: %.3lf seconds\n", sim_runtime); - printf("Lookups/s: "); - fancy_int(sim_only_lookups_per_sec); - - int is_invalid = 1; - - unsigned long long large = 0; - unsigned long long small = 0; - if(input.simulation_method == HISTORY_BASED ) - { - large = 351485; - small = 879693; - } - else if( input.simulation_method == EVENT_BASED ) - { - large = 358389; - small = 880018; - } + int lookups = 0; + if( input.simulation_method == HISTORY_BASED ) + lookups = input.lookups*input.particles; + else + lookups = input.lookups; + int lookups_per_sec = (int) ((double) lookups / runtime); + int sim_only_lookups_per_sec = (int) ((double) lookups/ sim_runtime); + printf("Total Time Statistics (OpenCL Init / JIT Compilation + Simulation Kernel)\n"); + printf("Runtime: %.3lf seconds\n", runtime); + printf("Lookups: "); fancy_int(lookups); + printf("Lookups/s: "); + fancy_int(lookups_per_sec); + printf("Simulation Kernel Only Statistics\n"); + printf("Runtime: %.3lf seconds\n", sim_runtime); + printf("Lookups/s: "); + fancy_int(sim_only_lookups_per_sec); - if( input.HM == LARGE ) - { - if( vhash == large ) - { - printf("Verification checksum: %lu (Valid)\n", vhash); - is_invalid = 0; - } - else - printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash); - } - else if( input.HM == SMALL ) - { - if( vhash == small ) - { - printf("Verification checksum: %lu (Valid)\n", vhash); - is_invalid = 0; - } - else - printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash); - } + int is_invalid = 1; + + unsigned long long large = 0; + unsigned long long small = 0; + if(input.simulation_method == HISTORY_BASED ) + { + large = 351485; + small = 879693; + } + else if( input.simulation_method == EVENT_BASED ) + { + large = 358389; + small = 880018; + } + + if( input.HM == LARGE ) + { + if( vhash == large ) + { + printf("Verification checksum: %lu (Valid)\n", vhash); + is_invalid = 0; + } + else + printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash); + } + else if( input.HM == SMALL ) + { + if( vhash == small ) + { + printf("Verification checksum: %lu (Valid)\n", vhash); + is_invalid = 0; + } + else + printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash); + } - return is_invalid; + return is_invalid; } diff --git a/opencl/kernel.cl b/opencl/kernel.cl index c9d9174..e7458d0 100644 --- a/opencl/kernel.cl +++ b/opencl/kernel.cl @@ -25,6 +25,8 @@ typedef struct{ int particles; int simulation_method; int kernel_id; + int platform_id; + int device_id; } Input; typedef struct{ diff --git a/opencl/main.c b/opencl/main.c index 658e6b0..deea25a 100644 --- a/opencl/main.c +++ b/opencl/main.c @@ -38,9 +38,6 @@ int main(int argc, char * argv[]) // ===================================================================== // Cross Section (XS) Parallel Lookup Simulation Begins // ===================================================================== - border_print(); - center_print("SIMULATION", 79); - border_print(); unsigned long vhash = 0; diff --git a/opencl/rsbench.h b/opencl/rsbench.h index 664cb3e..1aa9b50 100644 --- a/opencl/rsbench.h +++ b/opencl/rsbench.h @@ -7,10 +7,12 @@ #include #include #include -#define CL_TARGET_OPENCL_VERSION 200 +#define CL_TARGET_OPENCL_VERSION 120 #include #define MAX_SOURCE_SIZE (0x100000) +#include "cl_utils.h" + #define PI 3.14159265359 // typedefs @@ -39,6 +41,8 @@ typedef struct{ int particles; int simulation_method; int kernel_id; + int platform_id; + int device_id; } Input; typedef struct{ diff --git a/opencl/simulation.c b/opencl/simulation.c index 8bec817..c2607c3 100644 --- a/opencl/simulation.c +++ b/opencl/simulation.c @@ -14,7 +14,6 @@ unsigned long long run_event_based_simulation(Input in, SimulationData SD, double * sim_runtime) { - printf("Initializing OpenCL data structures and JIT compiling kernel...\n"); double start = get_time(); @@ -39,219 +38,208 @@ unsigned long long run_event_based_simulation(Input in, SimulationData SD, doubl source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); - // Get platform and device information - cl_platform_id platform_id = NULL; - cl_device_id device_id = NULL; - cl_uint ret_num_devices; - cl_uint ret_num_platforms; - cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); - check(ret); - ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); - //ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); - //ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); - check(ret); - - // Print info about where we are running - print_single_info(platform_id, device_id); - - // Create an OpenCL context - cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); - check(ret); - - // Create a command queue - cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); - check(ret); - - //////////////////////////////////////////////////////////////////////////////// - // OpenCL Move Memory To Device Buffers - //////////////////////////////////////////////////////////////////////////////// - - // Create memory buffers on the device for each vector and move data over - size_t sz = SD.length_num_nucs * sizeof(int); - cl_mem num_nucs_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, num_nucs_d, CL_TRUE, 0, sz, SD.num_nucs, 0, NULL, NULL); - check(ret); - - sz = SD.length_concs * sizeof(double); - cl_mem concs_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, concs_d, CL_TRUE, 0, sz, SD.concs, 0, NULL, NULL); - check(ret); - - sz = SD.length_mats * sizeof(int); - cl_mem mats_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, mats_d, CL_TRUE, 0, sz, SD.mats, 0, NULL, NULL); - check(ret); - - sz = in.lookups * sizeof(int); - cl_mem verification_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &ret); - check(ret); - - sz = SD.length_n_windows * sizeof(int); - cl_mem n_windows_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, n_windows_d, CL_TRUE, 0, sz, SD.n_windows, 0, NULL, NULL); - check(ret); - - sz = SD.length_poles * sizeof(Pole); - cl_mem poles_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, poles_d, CL_TRUE, 0, sz, SD.poles, 0, NULL, NULL); - check(ret); - - sz = SD.length_windows * sizeof(Window); - cl_mem windows_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, windows_d, CL_TRUE, 0, sz, SD.windows, 0, NULL, NULL); - check(ret); - - sz = SD.length_pseudo_K0RS * sizeof(double); - cl_mem pseudo_K0RS_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); - check(ret); - ret = clEnqueueWriteBuffer(command_queue, pseudo_K0RS_d, CL_TRUE, 0, sz, SD.pseudo_K0RS, 0, NULL, NULL); - check(ret); - - //////////////////////////////////////////////////////////////////////////////// - // OpenCL Build and Intiailize Kernel Program - //////////////////////////////////////////////////////////////////////////////// - - // Create a program from the kernel source - cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); - check(ret); - - // Build the program - ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); - check(ret); - - printCompilerError( program, device_id ); - - // Create the OpenCL kernel - cl_kernel kernel = clCreateKernel(program, "macro_xs_lookup_kernel", &ret); - check(ret); - - // Set the arguments of the kernel - ret = clSetKernelArg(kernel, 0, sizeof(Input), (void *)&in); - check(ret); - ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&num_nucs_d); - check(ret); - ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&mats_d); - check(ret); - ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&SD.max_num_nucs); - check(ret); - ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&concs_d); - check(ret); - ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&n_windows_d); - check(ret); - ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&pseudo_K0RS_d); - check(ret); - ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&windows_d); - check(ret); - ret = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&poles_d); - check(ret); - ret = clSetKernelArg(kernel, 9, sizeof(int), (void *)&SD.max_num_windows); - check(ret); - ret = clSetKernelArg(kernel, 10, sizeof(int), (void *)&SD.max_num_poles); - check(ret); - ret = clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&verification_array); - check(ret); - - double stop = get_time(); - printf("OpenCL initialization time: %.3lf seconds\n", stop-start); - start = stop; - - //////////////////////////////////////////////////////////////////////////////// - // Run Simulation Kernel - //////////////////////////////////////////////////////////////////////////////// - - printf("Running event based simulation...\n"); - - // Execute the OpenCL kernel on the list - size_t global_item_size = in.lookups; // Process the entire lists - size_t local_item_size = 8; // Divide work items into groups of 8 - ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); - check(ret); - - //////////////////////////////////////////////////////////////////////////////// - // Retrieve verification data from device and reduce it - //////////////////////////////////////////////////////////////////////////////// - - // Read the memory buffer C on the device to the local variable C - ret = clEnqueueReadBuffer(command_queue, verification_array, CL_TRUE, 0, in.lookups * sizeof(int), verification_array_host, 0, NULL, NULL); - check(ret); - - printf("Reducing verification value...\n"); - - unsigned long long verification = 0; - - for( int l = 0; l < in.lookups; l++ ) - verification += verification_array_host[l]; - - stop = get_time(); - *sim_runtime = stop-start; - printf("Simulation + Verification Reduction Runtime: %.3lf seconds\n", *sim_runtime); - - //////////////////////////////////////////////////////////////////////////////// - // OpenCL cleanup - //////////////////////////////////////////////////////////////////////////////// - - ret = clFlush(command_queue); - check(ret); - ret = clFinish(command_queue); - check(ret); - ret = clReleaseKernel(kernel); - check(ret); - ret = clReleaseProgram(program); - check(ret); - ret = clReleaseMemObject(num_nucs_d); - check(ret); - ret = clReleaseMemObject(mats_d); - check(ret); - ret = clReleaseMemObject(n_windows_d); - check(ret); - ret = clReleaseMemObject(poles_d); - check(ret); - ret = clReleaseMemObject(windows_d); - check(ret); - ret = clReleaseMemObject(pseudo_K0RS_d); - check(ret); - ret = clReleaseMemObject(verification_array); - check(ret); - ret = clReleaseCommandQueue(command_queue); - check(ret); - ret = clReleaseContext(context); - check(ret); - - return verification; + OpenCLInfo CL = initialize_device(in.platform_id, in.device_id); + cl_device_id device_id = CL.device_id; + cl_context context = CL.context; + cl_command_queue command_queue = CL.command_queue; + + printf("Initializing OpenCL data structures and JIT compiling kernel...\n"); + + //////////////////////////////////////////////////////////////////////////////// + // OpenCL Move Memory To Device Buffers + //////////////////////////////////////////////////////////////////////////////// + + // Create memory buffers on the device for each vector and move data over + size_t sz = SD.length_num_nucs * sizeof(int); + cl_int ret; + cl_mem num_nucs_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, num_nucs_d, CL_TRUE, 0, sz, SD.num_nucs, 0, NULL, NULL); + check(ret); + + sz = SD.length_concs * sizeof(double); + cl_mem concs_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, concs_d, CL_TRUE, 0, sz, SD.concs, 0, NULL, NULL); + check(ret); + + sz = SD.length_mats * sizeof(int); + cl_mem mats_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, mats_d, CL_TRUE, 0, sz, SD.mats, 0, NULL, NULL); + check(ret); + + sz = in.lookups * sizeof(int); + cl_mem verification_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, &ret); + check(ret); + + sz = SD.length_n_windows * sizeof(int); + cl_mem n_windows_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, n_windows_d, CL_TRUE, 0, sz, SD.n_windows, 0, NULL, NULL); + check(ret); + + sz = SD.length_poles * sizeof(Pole); + cl_mem poles_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, poles_d, CL_TRUE, 0, sz, SD.poles, 0, NULL, NULL); + check(ret); + + sz = SD.length_windows * sizeof(Window); + cl_mem windows_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, windows_d, CL_TRUE, 0, sz, SD.windows, 0, NULL, NULL); + check(ret); + + sz = SD.length_pseudo_K0RS * sizeof(double); + cl_mem pseudo_K0RS_d = clCreateBuffer(context, CL_MEM_READ_ONLY, sz, NULL, &ret); + check(ret); + ret = clEnqueueWriteBuffer(command_queue, pseudo_K0RS_d, CL_TRUE, 0, sz, SD.pseudo_K0RS, 0, NULL, NULL); + check(ret); + + //////////////////////////////////////////////////////////////////////////////// + // OpenCL Build and Intiailize Kernel Program + //////////////////////////////////////////////////////////////////////////////// + + // Create a program from the kernel source + cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + check(ret); + + // Build the program + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + check(ret); + + printCompilerError( program, device_id ); + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, "macro_xs_lookup_kernel", &ret); + check(ret); + + // Set the arguments of the kernel + ret = clSetKernelArg(kernel, 0, sizeof(Input), (void *)&in); + check(ret); + ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&num_nucs_d); + check(ret); + ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&mats_d); + check(ret); + ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&SD.max_num_nucs); + check(ret); + ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&concs_d); + check(ret); + ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&n_windows_d); + check(ret); + ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&pseudo_K0RS_d); + check(ret); + ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&windows_d); + check(ret); + ret = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&poles_d); + check(ret); + ret = clSetKernelArg(kernel, 9, sizeof(int), (void *)&SD.max_num_windows); + check(ret); + ret = clSetKernelArg(kernel, 10, sizeof(int), (void *)&SD.max_num_poles); + check(ret); + ret = clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&verification_array); + check(ret); + + double stop = get_time(); + printf("OpenCL initialization time: %.3lf seconds\n", stop-start); + start = stop; + + //////////////////////////////////////////////////////////////////////////////// + // Run Simulation Kernel + //////////////////////////////////////////////////////////////////////////////// + + border_print(); + center_print("SIMULATION", 79); + border_print(); + + printf("Running event based simulation...\n"); + + // Execute the OpenCL kernel on the list + size_t global_item_size = in.lookups; // Process the entire lists + size_t local_item_size = 8; // Divide work items into groups of 8 + ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); + check(ret); + + //////////////////////////////////////////////////////////////////////////////// + // Retrieve verification data from device and reduce it + //////////////////////////////////////////////////////////////////////////////// + + // Read the memory buffer C on the device to the local variable C + ret = clEnqueueReadBuffer(command_queue, verification_array, CL_TRUE, 0, in.lookups * sizeof(int), verification_array_host, 0, NULL, NULL); + check(ret); + + printf("Reducing verification value...\n"); + + unsigned long long verification = 0; + + for( int l = 0; l < in.lookups; l++ ) + verification += verification_array_host[l]; + + stop = get_time(); + *sim_runtime = stop-start; + printf("Simulation + Verification Reduction Runtime: %.3lf seconds\n", *sim_runtime); + + //////////////////////////////////////////////////////////////////////////////// + // OpenCL cleanup + //////////////////////////////////////////////////////////////////////////////// + + ret = clFlush(command_queue); + check(ret); + ret = clFinish(command_queue); + check(ret); + ret = clReleaseKernel(kernel); + check(ret); + ret = clReleaseProgram(program); + check(ret); + ret = clReleaseMemObject(num_nucs_d); + check(ret); + ret = clReleaseMemObject(mats_d); + check(ret); + ret = clReleaseMemObject(n_windows_d); + check(ret); + ret = clReleaseMemObject(poles_d); + check(ret); + ret = clReleaseMemObject(windows_d); + check(ret); + ret = clReleaseMemObject(pseudo_K0RS_d); + check(ret); + ret = clReleaseMemObject(verification_array); + check(ret); + ret = clReleaseCommandQueue(command_queue); + check(ret); + ret = clReleaseContext(context); + check(ret); + + return verification; } double LCG_random_double(uint64_t * seed) { - const uint64_t m = 9223372036854775808ULL; // 2^63 - const uint64_t a = 2806196910506780709ULL; - const uint64_t c = 1ULL; - *seed = (a * (*seed) + c) % m; - return (double) (*seed) / (double) m; + const uint64_t m = 9223372036854775808ULL; // 2^63 + const uint64_t a = 2806196910506780709ULL; + const uint64_t c = 1ULL; + *seed = (a * (*seed) + c) % m; + return (double) (*seed) / (double) m; } uint64_t LCG_random_int(uint64_t * seed) { - const uint64_t m = 9223372036854775808ULL; // 2^63 - const uint64_t a = 2806196910506780709ULL; - const uint64_t c = 1ULL; - *seed = (a * (*seed) + c) % m; - return *seed; + const uint64_t m = 9223372036854775808ULL; // 2^63 + const uint64_t a = 2806196910506780709ULL; + const uint64_t c = 1ULL; + *seed = (a * (*seed) + c) % m; + return *seed; } RSComplex c_mul( RSComplex A, RSComplex B) { - double a = A.r; - double b = A.i; - double c = B.r; - double d = B.i; - RSComplex C; - C.r = (a*c) - (b*d); - C.i = (a*d) + (b*c); - return C; + double a = A.r; + double b = A.i; + double c = B.r; + double d = B.i; + RSComplex C; + C.r = (a*c) - (b*d); + C.i = (a*d) + (b*c); + return C; } diff --git a/openmp-offload/Makefile b/openmp-offload/Makefile index e1d90e0..1063181 100644 --- a/openmp-offload/Makefile +++ b/openmp-offload/Makefile @@ -2,7 +2,7 @@ # User Options #=============================================================================== -COMPILER = ibm +COMPILER = intel OPTIMIZE = yes DEBUG = no PROFILE = no @@ -36,6 +36,16 @@ ifeq ($(COMPILER),gnu) CFLAGS += -fopenmp -ffast-math -flto endif +# Intel Compiler +ifeq ($(COMPILER),intel) + CC = icx + CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ + # Optimization Flags + ifeq ($(OPTIMIZE),yes) + CFLAGS += -O3 + endif +endif + # Clang-ykt compiler Targeting P100 ifeq ($(COMPILER),clang) CC = clang