This tutorial shows a minimal conversion from our vector addition CPU code to an OpenCL version. Consider this a OpenCL ‘Hello World’. Error handling is not included so that the structure of the code is more digestible. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov

vecAdd.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>

// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                       "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                    \n" \
"__kernel void vecAdd(  __global double *a,                       \n" \
"                       __global double *b,                       \n" \
"                       __global double *c,                       \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;

int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 100000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;

    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);

    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    size_t globalSize, localSize;
    cl_int err;

    // Number of work items in each local work group
    localSize = 64;

    // Number of total work items - localSize must be devisor
    globalSize = ceil(n/(float)localSize)*localSize;

    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);

    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    // Create a context  
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    // Create a command queue 
    queue = clCreateCommandQueue(context, device_id, 0, &err);

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);

    // Build the program executable 
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);

    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

    // Execute the kernel over the entire range of the data set  
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);

    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);

    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );

    //Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

Changes

Kernel:
The kernel is the heart of our OpenCL code. The entire kernel must eventually be read in as a c string, the easiest way for a small program like this is to wrap quotes and line returns around the entire kernel. In a real program you would more than likely read the kernel in from a separate file.

// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                      "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                    \n" \
"__kernel void vecAdd(  __global double *a,                       \n" \
"                       __global double *b,                       \n" \
"                       __global double *c,                       \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;

Let’s take a look at what makes up this simple kernel.

__kernel void vecAdd(  __global double *a, __global double *b,
                       __global double *c, const unsigned int n)

The __kernel decorator specifies this is an OpenCL kernel and the __global decorator specifies the pointer is referring to the global device memory space, otherwise normal C function syntax is used. The kernel must have return type void.

int id = get_global_id(0); 

Here we grab the global work item id for dimension 0.

if (id < n)
    c[id] = a[id] + b[id];

There must be an integer number of work groups, or put in another way the number of work items in each work group must be a devisor of the global number of work items. Since the work group size is used to tune performance and will not necessarily be a devisor of the total number of threads needed it is common to be forced to launch more threads than are needed and ignore the extras. After we check that we are inside of the problem domain we can access and manipulate the device memory.

Memory:

// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
 
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;

With the host CPU and GPU having separate memory spaces we must maintain two separate references to our memory, one set for our host array pointers and one set for our device array memory handles. Here we use the h_ and d_ prefix to differentiate them.

Thread Mapping:

// Number of work items in each local work group
localSize = 64;
 
// Number of total work items - localSize must be devisor
globalSize = ceil(n/(float)localSize)*localSize;

To map our problem onto the underlying hardware we must specify a local and global integer size. The local size defines the number of work items in a work group, on an NVIDIA GPU this is equivalent to the number of threads in a thread block. The global size is the total number of work items launched. the localSize must be a devisor of globalSize and so we calculate the smallest integer that covers our problem domain and is divisible by localSize.

OpenCL Setup:

// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);

Each hardware vendor will have a different platform which must be bound to before use, here clGetPlatformIDs will set cpPlatform to contain the systems available platforms. For example if a system contains an AMD CPU and an NVIDIA GPU with the appropriate OpenCL drivers installed two OpenCL platforms will be available.

// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

The platform can be queried to find what specific devices it contains. In this case we query the platform for GPU devices by using the CL_DEVICE_TYPE_GPU enumerated value.

// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

Before using an OpenCL device a context must be set up. The context will be used to manage command queues, memory, and kernel activity. A context can contain more than one device.

// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);

The command queue is used to stream commands from the host to the specified device. Memory transfers and kernel activity can pushed to the command queue where they will be executed on the device when possible.

Compile Kernel:

// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
                        (const char **) & kernelSource, NULL, &err);
 
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);

To ensure that OpenCL code is portable to many devices the default way to run kernels is with just-in-time, or JIT, compilation. We must prepare the source code for the device(s) in a given context. First we create our program, which is a set of kernel code, and then from that program we create the individual kernels.

Prepare data on device:

// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
 
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                               bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                               bytes, h_b, 0, NULL, NULL);
 
// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

Before launching our kernel we must create buffers between the host and device, bind the host data to those newly created device buffers, and finally set the kernel arguments.

Launch Kernel:

// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                          0, NULL, NULL);

Once all of the memory resides on the device the kernel can be enqueued to be launched.

Copy results to host:

// Wait for the command queue to get serviced before reading back results
clFinish(queue);
 
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                            bytes, h_c, 0, NULL, NULL );

We can block until the command queue is cleared and then read back the device results to the host.

Compiling

$ module load cudatoolkit
$ cc -lOpenCL vecAdd.c -o vecAdd.out

Running

$ aprun ./vecAdd.out
final result: 1.000000

vecAdd.cc

C++ bindings are commonly used for OpenCL development and offers an interface that is somewhat more streamlined than the standard C interface. An example of the vectorAddition code written using these bindings is given below.

#define __CL_ENABLE_EXCEPTIONS

#include "cl.hpp"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <math.h>

// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource =                                      "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                    \n" \
"__kernel void vecAdd(  __global double *a,                       \n" \
"                       __global double *b,                       \n" \
"                       __global double *c,                       \n" \
"                       const unsigned int n)                    \n" \
"{                                                               \n" \
"    //Get our global thread ID                                  \n" \
"    int id = get_global_id(0);                                  \n" \
"                                                                \n" \
"    //Make sure we do not go out of bounds                      \n" \
"    if (id < n)                                                 \n" \
"        c[id] = a[id] + b[id];                                  \n" \
"}                                                               \n" \
                                                                "\n" ;


int main(int argc, char *argv[])
{

    // Length of vectors
    unsigned int n = 1000;

    // Host input vectors
    double *h_a;
    double *h_b;
    // Host output vector
    double *h_c;

    // Device input buffers
    cl::Buffer d_a;
    cl::Buffer d_b;
    // Device output buffer
    cl::Buffer d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector on host
    h_a = new double[n];
    h_b = new double[n];
    h_c = new double[n];

    // Initialize vectors on host
    for(int i = 0; i < n; i++ )
    {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    cl_int err = CL_SUCCESS;
    try {

        // Query platforms
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.size() == 0) {
            std::cout << "Platform size 0\n";
            return -1;
         }

        // Get list of devices on default platform and create context
        cl_context_properties properties[] =
           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
        cl::Context context(CL_DEVICE_TYPE_GPU, properties);
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Create command queue for first device
        cl::CommandQueue queue(context, devices[0], 0, &err);

        // Create device memory buffers
        d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);

        // Bind memory buffers
        queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
        queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);

        //Build kernel from source string
        cl::Program::Sources source(1,
            std::make_pair(kernelSource,strlen(kernelSource)));
        cl::Program program_ = cl::Program(context, source);
        program_.build(devices);

        // Create kernel object
        cl::Kernel kernel(program_, "vecAdd", &err);

        // Bind kernel arguments to kernel
        kernel.setArg(0, d_a);
        kernel.setArg(1, d_b);
        kernel.setArg(2, d_c);
        kernel.setArg(3, n);

        // Number of work items in each local work group
        cl::NDRange localSize(64);
        // Number of total work items - localSize must be devisor
        cl::NDRange globalSize((int)(ceil(n/(float)64)*64));

        // Enqueue kernel
        cl::Event event;
        queue.enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            globalSize,
            localSize,
            NULL,
            &event);

        // Block until kernel completion
        event.wait();

        // Read back d_c
        queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
        }
    catch (cl::Error err) {
         std::cerr
            << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
    }

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(int i=0; i<n; i++)
        sum += h_c[i];
    std::cout<<"final result: "<<sum/n<<std::endl;

    // Release host memory
    delete(h_a);
    delete(h_b);
    delete(h_c);

    return 0;
}

Compiling and Running, C++

To compile you will first need to download the OpenCL C++ header file cl.hpp.

$ module load cudatoolkit
$ CC vecAdd.cc -lOpenCL -o vecAdd.out
$ aprun ./vecAdd.out
final result: 1.000000

In this tutorial we will cover two different OpenCL versions of our Game of Life program. The first will strictly use global memory and be a straightforward GPU port of our program, the second we will introduce local(shared) memory. Each program will be separated into two files, one containing C code and another containing the device kernels. Basic OpenCL concepts covered in the vector addition sample will not be covered in detail here for brevity. A description of the Game of Life and the original CPU code can be found here. The full source can be viewed or downloaded at the OLCF GitHub. Please direct any questions or comments to help@nccs.gov

GOL.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>
#include <sys/stat.h>

#define SRAND_VALUE 1985
#define LOCAL_SIZE 16

int main(int argc, char* argv[])
{
    int i,j,iter;
    int *h_grid;
    cl_mem d_grid;
    cl_mem d_newGrid;
    cl_mem d_tmpGrid;

    // Linear game grid dimension
    int dim = 1024;
    // Number of game iterations
    int maxIter = 1<<10;

    // Size, in bytes, of each vector
    size_t bytes = sizeof(int)*(dim+2)*(dim+2);

    // Allocate host Grid used for initial setup and read back from device
    h_grid = (int*)malloc(bytes);

    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    //Kernels
    cl_kernel k_gol, k_ghostRows, k_ghostCols;

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

   cl_int err;

    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
    if (err != CL_SUCCESS) {
      printf( "Error: Failed to find a platform\n");
      return EXIT_FAILURE;
    }

    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to create a device group\n");
        return EXIT_FAILURE;
    }

    // Create a context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context) {
      printf("Error: Failed to create a compute context\n");
      return EXIT_FAILURE;
    }

    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
    if (!queue) {
      printf("Error: Failed to create a command commands\n");
      return EXIT_FAILURE;
    }

    // Create the compute program from the kernel source file
    char *fileName = "GOL-kernels.cl";
    FILE *fh = fopen(fileName, "r");
    if(!fh) {
        printf("Error: Failed to open file\n");
        return 0;
    }
    struct stat statbuf;
    stat(fileName, &statbuf);
    char *kernelSource = (char *) malloc(statbuf.st_size + 1);
    fread(kernelSource, statbuf.st_size, 1, fh);
    kernelSource[statbuf.st_size] = '\0';
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
    if (!program) {
      printf("Error: Failed to create compute program\n");
      return EXIT_FAILURE;
    }

    // Build the program executable
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to build program executable %d\n", err);
      return EXIT_FAILURE;
    }

    // Create the GOL kernel in the program we wish to run
    k_gol = clCreateKernel(program, "GOL", &err);
    if (!k_gol || err != CL_SUCCESS) {
      printf("Error: Failed to create GOL kernel \n");
      return EXIT_FAILURE;
    }

    // Create the ghostRows kernel in the program we wish to run
    k_ghostRows = clCreateKernel(program, "ghostRows", &err);
    if (!k_ghostRows || err != CL_SUCCESS) {
      printf("Error: Failed to create ghostRows kernel\n");
      return EXIT_FAILURE;
    }

    // Create the ghostCols kernel in the program we wish to run
    k_ghostCols = clCreateKernel(program, "ghostCols", &err);
    if (!k_ghostCols || err != CL_SUCCESS) {
      printf("Error: Failed to create ghostCols kernel\n");
      return EXIT_FAILURE;
    }

    // Create the input and output arrays in device memory for our calculation
    d_grid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL);
    d_newGrid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL);
    if (!d_grid || !d_newGrid) {
      printf("Error: Failed to allocate device memory\n");
      return EXIT_FAILURE;
    }

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_grid, CL_TRUE, 0,
                                   bytes, h_grid, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to write to source array\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to GOL  kernel
    err  = clSetKernelArg(k_gol, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid);
    err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to ghostRows kernel
    err  = clSetKernelArg(k_ghostRows, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to ghostCols kernel
    err  = clSetKernelArg(k_ghostCols, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set kernel local and global sizes
    size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize;
    cpyLocalSize = LOCAL_SIZE;
    // Number of total work items - localSize must be devisor
    cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize;
    cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize;

    size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE};
    size_t linGlobal = (size_t)ceil(dim/(float)LOCAL_SIZE)*LOCAL_SIZE;
    size_t GolGlobalSize[2] = {linGlobal, linGlobal};

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
        err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize,
                                                              0, NULL, NULL);
        err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize,
                                                              0, NULL, NULL);
        err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize,
                                                              0, NULL, NULL);

        if(iter%2 == 1) {
            err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid);
        } else {
            err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid);
        }
    }// End main game loop

    if (err != CL_SUCCESS) {
       printf("Error: Failed to launch kernels%d\n",err);
       return EXIT_FAILURE;
    }

    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);

    // Read the results from the device
    err =  clEnqueueReadBuffer(queue, d_grid, CL_TRUE, 0,
                        bytes, h_grid, 0, NULL, NULL );
    if (err != CL_SUCCESS) {
      printf("Error: Failed to read output array\n");
      return EXIT_FAILURE;;
    }

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total);

    // Release memory
    free(h_grid);

    return 0;
}

Changes

// Create the compute program from the kernel source file
char *fileName = "GOL-kernels.cl";
FILE *fh = fopen(fileName, "r");
if(!fh) {
    printf("Error: Failed to open file\n");
    return 0;
}
struct stat statbuf;
stat(fileName, &statbuf);
char *kernelSource = (char *) malloc(statbuf.st_size + 1);
fread(kernelSource, statbuf.st_size, 1, fh);
kernelSource[statbuf.st_size] = '\0';
program = clCreateProgramWithSource(context, 1,
                                    (const char **) & kernelSource, NULL, &err);
if (!program) {
    printf("Error: Failed to create compute program\n");
    return EXIT_FAILURE;
}

For more complicated kernels it is not reasonable to enter them as a char string in our c source file. Here we will read in the contents of GOL-kernels.cl which as we will see contains 3 OpenCL kernels.

// Set kernel local and global sizes
size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize;
cpyLocalSize = LOCAL_SIZE;
// Number of total work items - localSize must be devisor
cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize;
cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize;

The copy kernels use a linear work group size. The cpyLocalSize must be a devisor of the cpy*GlobalSize and so we calculate the smallest integer that covers our problem domain and is divisible by cpyLocalSize.

size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE};
size_t linGlobal = (size_t)ceil(dim/(float)LOCAL_SIZE)*LOCAL_SIZE;
size_t GolGlobalSize[2] = {linGlobal, linGlobal};

For the GOL kernel we specify a two dimensional work group size to better suite the problems geometry. The necessary global size is then be calculated.

// Main game loop
for (iter = 0; iter<maxIter; iter++) {
    err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize,
                                 0, NULL, NULL);
    err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize,
                                  0, NULL, NULL);
    err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize,
                                  0, NULL, NULL);
    
    if(iter%2 == 1) {
        err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid);
        err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid);
        err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid);
        err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid);
    } else {
        err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid);
        err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid);
        err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid);
        err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid);
    }
}// End main game loop

OpenCL handles memory in terms of buffers, not pointers, so we are unable to preform a simple pointer swap. The process is relatively straight forward however as the argument order can be switched with a simple conditional statement inside of the main game loop.

GOL-kernels.cl

__kernel void ghostRows(const int dim,
                        __global *grid)
{
    // We want id to range from 1 to dim
    int id = get_global_id(0) + 1;

    if (id <= dim)
    {
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id]; //Copy first real row to bottom ghost row
        grid[id] = grid[(dim+2)*dim + id]; //Copy last real row to top ghost row
    }
}

__kernel void ghostCols(const int dim,
                        __global *grid)
{
    // We want id to range from 0 to dim+1
    int id = get_global_id(0);

    if (id <= dim+1)
    {
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1]; //Copy first real column to right most ghost column
        grid[id*(dim+2)] = grid[id*(dim+2) + dim]; //Copy last real column to left most ghost column
    }
}

__kernel void GOL(const int dim,
                  __global int *grid,
                  __global int *newGrid)
{
    int ix = get_global_id(0) + 1;
    int iy = get_global_id(1) + 1;
    int id = iy * (dim+2) + ix;

    int numNeighbors;

    if (iy <= dim && ix <= dim) {

    // Get the number of neighbors for a given grid point
    numNeighbors = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower
                 + grid[id+1] + grid[id-1]             //right left
                 + grid[id+(dim+3)] + grid[id-(dim+3)] //diagonals
                 + grid[id-(dim+1)] + grid[id+(dim+1)];

    int cell = grid[id];
    // Here we have explicitly all of the game rules
    if (cell == 1 && numNeighbors < 2)
        newGrid[id] = 0;
    else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
        newGrid[id] = 1;
    else if (cell == 1 && numNeighbors > 3)
        newGrid[id] = 0;
    else if (cell == 0 && numNeighbors == 3)
        newGrid[id] = 1;
    else
        newGrid[id] = cell;
    }
}

GOL-Local.c

In the first example each work item read nine values from global memory and wrote back one value. The inefficiency of this approach is that each value that is read from global memory is used not only in calculating its own new value but also by its eight closest neighbors in calculating their new value. Although some global memory access penalties have been alleviated in the latest generation of cards with the addition of an L2 cache you will find local memory is still an incredibly important aspect of GPU programming. Local memory provides a very fast memory space(comparable to register access) that is shared between work items of the same work-group. In the case where multiple work items in a particular work-group are accessing the same global memory item local memory may be used to reduce the number of global memory calls. A typical use for local memory is as follows:

  1. Have each work item read from global memory into local memory
  2. Synchronize work items to ensure all items in a given work-group have finished reading from global memory
  3. Have each work item preform calculations using local memory values
  4. Have each work item write result from local memory back to device global memory
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>
#include <sys/stat.h>

#define SRAND_VALUE 1985
#define LOCAL_SIZE 16

int main(int argc, char* argv[])
{
    int i,j,iter;
    int *h_grid;
    cl_mem d_grid;
    cl_mem d_newGrid;
    cl_mem d_tmpGrid;

    // Linear game grid dimension
    int dim = 1024;
    // Number of game iterations
    int maxIter = 1<<10;

    // Size, in bytes, of each vector
    size_t bytes = sizeof(int)*(dim+2)*(dim+2);

    // Allocate host Grid used for initial setup and read back from device
    h_grid = (int*)malloc(bytes);

    cl_platform_id cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    //Kernels
    cl_kernel k_gol, k_ghostRows, k_ghostCols;

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

   cl_int err;

    // Bind to platform
    err = clGetPlatformIDs(1, &cpPlatform, NULL);
    if (err != CL_SUCCESS) {
      printf( "Error: Failed to find a platform\n");
      return EXIT_FAILURE;
    }

    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to create a device group\n");
        return EXIT_FAILURE;
    }

    // Create a context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context) {
      printf("Error: Failed to create a compute context\n");
      return EXIT_FAILURE;
    }

    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);
    if (!queue) {
      printf("Error: Failed to create a command commands\n");
      return EXIT_FAILURE;
    }

    // Create the compute program from the kernel source file
    char *fileName = "GOL-kernels.cl";
    FILE *fh = fopen(fileName, "r");
    if(!fh) {
        printf("Error: Failed to open file\n");
        return 0;
    }
    struct stat statbuf;
    stat(fileName, &statbuf);
    char *kernelSource = (char *) malloc(statbuf.st_size + 1);
    fread(kernelSource, statbuf.st_size, 1, fh);
    kernelSource[statbuf.st_size] = '\0';
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);
    if (!program) {
      printf("Error: Failed to create compute program\n");
      return EXIT_FAILURE;
    }

    // Build the program executable
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to build program executable %d\n", err);
      return EXIT_FAILURE;
    }

    // Create the GOL kernel in the program we wish to run
    k_gol = clCreateKernel(program, "GOL", &err);
    if (!k_gol || err != CL_SUCCESS) {
      printf("Error: Failed to create GOL kernel\n");
      return EXIT_FAILURE;
    }

    // Create the ghostRows kernel in the program we wish to run
    k_ghostRows = clCreateKernel(program, "ghostRows", &err);
    if (!k_ghostRows || err != CL_SUCCESS) {
      printf("Error: Failed to create ghostRows kernel\n");
      return EXIT_FAILURE;
    }

    // Create the ghostCols kernel in the program we wish to run
    k_ghostCols = clCreateKernel(program, "ghostCols", &err);
    if (!k_ghostCols || err != CL_SUCCESS) {
      printf("Error: Failed to create ghostCols kernel\n");
      return EXIT_FAILURE;
    }

    // Create the input and output arrays in device memory for our calculation
    d_grid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL);
    d_newGrid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL);
    if (!d_grid || !d_newGrid) {
      printf("Error: Failed to allocate device memory\n");
      return EXIT_FAILURE;
    }

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_grid, CL_TRUE, 0,
                                   bytes, h_grid, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to write to source array\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to GOL  kernel
    err  = clSetKernelArg(k_gol, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid);
    err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to ghostRows kernel
    err  = clSetKernelArg(k_ghostRows, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set the arguments to ghostCols kernel
    err  = clSetKernelArg(k_ghostCols, 0, sizeof(int), &dim);
    err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid);
    if (err != CL_SUCCESS) {
      printf("Error: Failed to set kernel arguments\n");
      return EXIT_FAILURE;
    }

    // Set kernel local and global sizes
    size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize;
    cpyLocalSize = LOCAL_SIZE;
    // Number of total work items - localSize must be devisor
    cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize;
    cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize;

    size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE};
    size_t linGlobal = (size_t)ceil(ceil(dim/(float)(LOCAL_SIZE-2))*(LOCAL_SIZE-2)/LOCAL_SIZE)*LOCAL_SIZE;
    size_t GolGlobalSize[2] = {linGlobal, linGlobal};

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
        err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize,
                                                              0, NULL, NULL);
        err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize,
                                                              0, NULL, NULL);
        err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize,
                                                              0, NULL, NULL);

        if(iter%2 == 1) {
            err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid);
            err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid);
        } else {
            err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid);
            err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid);
        }
    }// End main game loop

    if (err != CL_SUCCESS) {
       printf("Error: Failed to launch kernels%d\n",err);
       return EXIT_FAILURE;
    }

    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);

    // Read the results from the device
    err =  clEnqueueReadBuffer(queue, d_grid, CL_TRUE, 0,
                        bytes, h_grid, 0, NULL, NULL );
    if (err != CL_SUCCESS) {
      printf("Error: Failed to read output array\n");
      return EXIT_FAILURE;;
    }

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total);

    // Release memory
    free(h_grid);

    return 0;
}

GOL-local-kernels.cl

#define LOCAL_SIZE_x 16
#define LOCAL_SIZE_y 16

__kernel void ghostRows(const int dim,
                        __global int *grid)
{
    // We want id to range from 1 to dim
    int id = get_global_id(0) + 1;

    if (id <= dim)
    {
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id]; //Copy first real row to bottom ghost row
        grid[id] = grid[(dim+2)*dim + id]; //Copy last real row to top ghost row
    }
}

__kernel void ghostCols(const int dim,
                        __global int *grid)
{
    // We want id to range from 0 to dim+1
    int id = get_global_id(0);

    if (id <= dim+1)
    {
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1]; //Copy first real column to right most ghost column
        grid[id*(dim+2)] = grid[id*(dim+2) + dim]; //Copy last real column to left most ghost column
    }
}

__kernel void GOL(const int dim,
                  __global int *grid,
                  __global int *newGrid)
{
    int ix = (get_local_size(0)-2) * get_group_id(0) + get_local_id(0);
    int iy = (get_local_size(1)-2) * get_group_id(1) + get_local_id(1);
    int id = iy * (dim+2) + ix;

    int i = get_local_id(0);
    int j = get_local_id(1);
    int numNeighbors;

    // Declare the local memory on a per work group level
    __local int s_grid[LOCAL_SIZE_y][LOCAL_SIZE_x];

    // Copy cells into local memory
    if (ix <= dim+1 && iy <= dim+1)
        s_grid[i][j] = grid[id];

    //Sync all work items in work group
    barrier(CLK_LOCAL_MEM_FENCE);

    if (iy <= dim && ix <= dim) {
        if(i != 0 && i !=LOCAL_SIZE_y-1 && j != 0 && j !=LOCAL_SIZE_x-1) {

            // Get the number of neighbors for a given grid point
            numNeighbors = s_grid[i+1][j] + s_grid[i-1][j] //upper lower
                         + s_grid[i][j+1] + s_grid[i][j-1] //right left
                         + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals
                         + s_grid[i-1][j+1] + s_grid[i+1][j-1];

             int cell = s_grid[i][j];

             // Here we have explicitly all of the game rules
             if (cell == 1 && numNeighbors < 2)
                 newGrid[id] = 0;
             else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
                 newGrid[id] = 1;
             else if (cell == 1 && numNeighbors > 3)
                 newGrid[id] = 0;
             else if (cell == 0 && numNeighbors == 3)
                 newGrid[id] = 1;
             else
                 newGrid[id] = cell;
        }
    }
}

Changes to GOL-local-kernels.cl

    // Declare the local memory on a per work group level
    __local int s_grid[LOCAL_SIZE_y][LOCAL_SIZE_x];

The __local qualifier is used in the device kernel to allocate local memory arrays. The specified size is on a per work-group basis.

// Copy cells into local memory
if (ix <= dim+1 && iy <= dim+1)
    s_grid[i][j] = grid[id];

Each work item that is within the bounds of our work group size, including ghost cells, will read one value from global memory into local memory.

//Sync all threads in work group
barrier(CLK_LOCAL_MEM_FENCE);

Before we use any local memory we need to ensure that each work item in the work-group has finished reading its value from global memory into local memory by using a barrier, in this case the barrier is a work-group level barrier. When all work-items in a given work-group have reached the barrier execution will commence.

if (iy <= dim && ix <= dim) {
    if(i != 0 && i !=LOCAL_SIZE_y-1 && j != 0 && j !=LOCAL_SIZE_x-1) {

Before calculating our new values we must check that our global work item id’s, iy and ix, are do not extend into, or past, the ghost cells. Local work item id’s, i and j, must be checked so that no calculations are preformed in the per work-group ghost cells.

            // Get the number of neighbors for a given grid point
            numNeighbors = s_grid[i+1][j] + s_grid[i-1][j] //upper lower
                         + s_grid[i][j+1] + s_grid[i][j-1] //right left
                         + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals
                         + s_grid[i-1][j+1] + s_grid[i+1][j-1];
 
             int cell = s_grid[i][j];

The standard GOL calculations are preformed only now we are reading all cell values from local memory instead of global memory.