This tutorial shows a minimal conversion from our vector addition CPU code to an OpenCL version. Consider this an OpenCL ‘Hello World’. Error handling is not included so that the structure of the code is more digestible. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov
vecAdd.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>

// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                      "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                   \n" \
"__kernel void vecAdd(  __global double *a,                      \n" \
"                       __global double *b,                      \n" \
"                       __global double *c,                      \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;

int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;

    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);

    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    size_t globalSize, localSize;
    cl_int err;

    // Number of work items in each local work group
    localSize = 64;

    // Number of total work items - localSize must be divisor
    globalSize = ceil(n/(float)localSize)*localSize;

    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);

    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    // Create a context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);

    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);

    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

    // Execute the kernel over the entire range of the data set
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);

    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);

    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );

    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
Changes
Kernel:
The kernel is the heart of our OpenCL code. The entire kernel must eventually be read in as a C string; the easiest way for a small program like this is to wrap the entire kernel in quotes and explicit line returns. In a real program you would more than likely read the kernel in from a separate file, as sketched below.
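As a minimal sketch of that approach, the following helper reads a kernel from a hypothetical file named vecAdd.cl into a heap-allocated, null-terminated string. The filename, the helper name, and the absence of full error handling are all illustrative assumptions, not part of the tutorial code:

#include <stdio.h>
#include <stdlib.h>

// Read an entire kernel source file into a null-terminated string.
// The caller is responsible for free()ing the returned buffer.
char* loadKernelSource(const char *filename)
{
    FILE *fp = fopen(filename, "rb");
    if (!fp)
        return NULL;

    fseek(fp, 0, SEEK_END);
    long size = ftell(fp);
    rewind(fp);

    char *source = (char*)malloc(size + 1);
    fread(source, 1, size, fp);
    source[size] = '\0';    // clCreateProgramWithSource accepts
    fclose(fp);             // null-terminated source strings

    return source;
}

// Usage: const char *kernelSource = loadKernelSource("vecAdd.cl");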
// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                      "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                   \n" \
"__kernel void vecAdd(  __global double *a,                      \n" \
"                       __global double *b,                      \n" \
"                       __global double *c,                      \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;
Let’s take a look at what makes up this simple kernel.
__kernel void vecAdd(  __global double *a,
                       __global double *b,
                       __global double *c,
                       const unsigned int n)
The __kernel qualifier specifies that this function is an OpenCL kernel, and the __global qualifier specifies that each pointer refers to the global device memory space; otherwise normal C function syntax is used. The kernel must have return type void.
int id = get_global_id(0);
Here we grab the global work item id for dimension 0.
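For a one-dimensional launch with no global work offset, as used here, the global ID is just the work group index scaled by the work group size plus the local index. This equivalent form, shown only for illustration, can be useful when a kernel needs the group or local indices explicitly:

// With the zero global work offset used in this example, this is
// equivalent to int id = get_global_id(0);
int id = get_group_id(0) * get_local_size(0) + get_local_id(0);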
if (id < n)
    c[id] = a[id] + b[id];
There must be an integer number of work groups; put another way, the number of work items in each work group must be a divisor of the global number of work items. Since the work group size is used to tune performance and will not necessarily be a divisor of the total number of threads needed, it is common to be forced to launch more threads than are needed and ignore the extras. After we check that we are inside the problem domain we can access and manipulate the device memory.
Memory:
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;

// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
With the host CPU and GPU having separate memory spaces, we must maintain two separate sets of references to our memory: one set of host array pointers and one set of device array memory handles. Here we use the h_ and d_ prefixes to differentiate them.
Thread Mapping:
// Number of work items in each local work group
localSize = 64;

// Number of total work items - localSize must be divisor
globalSize = ceil(n/(float)localSize)*localSize;
To map our problem onto the underlying hardware we must specify a local and a global integer size. The local size defines the number of work items in a work group; on an NVIDIA GPU this is equivalent to the number of threads in a thread block. The global size is the total number of work items launched. Since localSize must be a divisor of globalSize, we calculate the smallest integer that covers our problem domain and is divisible by localSize.
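For example, with n = 100000 and localSize = 64, globalSize = ceil(100000/64.0)*64 = 1563*64 = 100032. The 32 extra work items fail the id < n bounds check inside the kernel and simply do nothing.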
OpenCL Setup:
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
Each hardware vendor provides its own platform, which must be bound to before use. Here clGetPlatformIDs will set cpPlatform to the first of the system's available platforms. For example, if a system contains an AMD CPU and an NVIDIA GPU, each with the appropriate OpenCL drivers installed, two OpenCL platforms will be available.
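Error handling is omitted throughout this tutorial, but every one of these setup calls reports a cl_int status code. A minimal sketch of checking it, assuming we simply want to abort on failure, might look like this:

err = clGetPlatformIDs(1, &cpPlatform, NULL);
if (err != CL_SUCCESS) {
    fprintf(stderr, "clGetPlatformIDs failed with error code %d\n", err);
    exit(EXIT_FAILURE);
}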
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
The platform can be queried to find what specific devices it contains. In this case we query the platform for GPU devices by using the CL_DEVICE_TYPE_GPU enumerated value.
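To confirm which device was selected, the device can itself be queried with clGetDeviceInfo. A small sketch follows; the 128-byte buffer is an assumption that is large enough for typical device name strings:

// Print the name of the selected device (buffer size is an assumption)
char deviceName[128];
err = clGetDeviceInfo(device_id, CL_DEVICE_NAME,
                      sizeof(deviceName), deviceName, NULL);
printf("Using device: %s\n", deviceName);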
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
Before using an OpenCL device a context must be set up. The context will be used to manage command queues, memory, and kernel activity. A context can contain more than one device.
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
The command queue is used to stream commands from the host to the specified device. Memory transfers and kernel launches can be pushed to the command queue, where they will be executed on the device when possible.
Compile Kernel:
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
                        (const char **) & kernelSource, NULL, &err);

// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
To ensure that OpenCL code is portable to many devices, the default way to run kernels is with just-in-time, or JIT, compilation. We must prepare the source code for the device(s) in a given context. First we create our program, which is a set of kernel source code, and then from that program we create the individual kernels.
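If the JIT compilation fails, the compiler output can be retrieved with clGetProgramBuildInfo. The following sketch, again with error handling pared down, prints the build log for our device:

if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
    size_t logSize;
    // First query the size of the log, then fetch it
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char *log = (char*)malloc(logSize);
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                          logSize, log, NULL);
    fprintf(stderr, "Build log:\n%s\n", log);
    free(log);
}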
Prepare data on device:
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                               bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                               bytes, h_b, 0, NULL, NULL);

// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
Before launching our kernel we must create the device memory buffers, copy the host data into those newly created buffers, and finally set the kernel arguments.
Launch Kernel:
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                          0, NULL, NULL);
Once all of the memory resides on the device, the kernel can be enqueued for launch.
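As an optional aside, the launch can also be timed with an OpenCL event. This sketch assumes the command queue was created with the CL_QUEUE_PROFILING_ENABLE property rather than the 0 used above:

// Assumes: queue = clCreateCommandQueue(context, device_id,
//                       CL_QUEUE_PROFILING_ENABLE, &err);
cl_event event;
cl_ulong start, end;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                             0, NULL, &event);
clWaitForEvents(1, &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &end, NULL);
printf("kernel time: %f ms\n", (end - start)*1.0e-6);
clReleaseEvent(event);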
Copy results to host:
// Wait for the command queue to get serviced before reading back results
clFinish(queue);

// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                            bytes, h_c, 0, NULL, NULL );
We can block until the command queue is cleared and then read back the device results to the host.
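If the host has other work to overlap with the transfer, a non-blocking read can be used instead. This sketch, illustrative only, passes CL_FALSE and waits on an event before touching h_c:

cl_event readDone;
clEnqueueReadBuffer(queue, d_c, CL_FALSE, 0,
                    bytes, h_c, 0, NULL, &readDone);

// ... independent host work may proceed here ...

clWaitForEvents(1, &readDone);   // h_c is now safe to read
clReleaseEvent(readDone);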
Compiling
$ module load cudatoolkit
$ cc -lOpenCL vecAdd.c -o vecAdd.out
Running
$ aprun ./vecAdd.out
final result: 1.000000
vecAdd.cc
C++ bindings are commonly used for OpenCL development and offer an interface that is somewhat more streamlined than the standard C interface. An example of the vector addition code written using these bindings is given below.
#define __CL_ENABLE_EXCEPTIONS

#include "cl.hpp"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <math.h>

// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                      "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                   \n" \
"__kernel void vecAdd(  __global double *a,                      \n" \
"                       __global double *b,                      \n" \
"                       __global double *c,                      \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;

int main(int argc, char *argv[])
{
    // Length of vectors
    unsigned int n = 1000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input buffers
    cl::Buffer d_a;
    cl::Buffer d_b;
    // Device output buffer
    cl::Buffer d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = new double[n];
    h_b = new double[n];
    h_c = new double[n];

    // Initialize vectors on host
    for(int i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    cl_int err = CL_SUCCESS;
    try {

        // Query platforms
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.size() == 0) {
            std::cout << "Platform size 0\n";
            return -1;
        }

        // Get list of devices on default platform and create context
        cl_context_properties properties[] =
           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
        cl::Context context(CL_DEVICE_TYPE_GPU, properties);
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create command queue for first device
        cl::CommandQueue queue(context, devices[0], 0, &err);

        // Create device memory buffers
        d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);

        // Bind memory buffers
        queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
        queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);

        // Build kernel from source string
        cl::Program::Sources source(1,
            std::make_pair(kernelSource,strlen(kernelSource)));
        cl::Program program_ = cl::Program(context, source);
        program_.build(devices);

        // Create kernel object
        cl::Kernel kernel(program_, "vecAdd", &err);

        // Bind kernel arguments to kernel
        kernel.setArg(0, d_a);
        kernel.setArg(1, d_b);
        kernel.setArg(2, d_c);
        kernel.setArg(3, n);

        // Number of work items in each local work group
        cl::NDRange localSize(64);
        // Number of total work items - localSize must be divisor
        cl::NDRange globalSize((int)(ceil(n/(float)64)*64));

        // Enqueue kernel
        cl::Event event;
        queue.enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            globalSize,
            localSize,
            NULL,
            &event);

        // Block until kernel completion
        event.wait();

        // Read back d_c
        queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
    }
    catch (cl::Error err) {
        std::cerr << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
    }

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(int i=0; i<n; i++)
        sum += h_c[i];
    std::cout<<"final result: "<<sum/n<<std::endl;

    // Release host memory
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;

    return 0;
}
Compiling and Running, C++
To compile you will first need to download the OpenCL C++ header file cl.hpp.
$ module load cudatoolkit
$ CC vecAdd.cc -lOpenCL -o vecAdd.out
$ aprun ./vecAdd.out
final result: 1.000000