In this tutorial we will cover two different OpenCL versions of our Game of Life program. The first will strictly use global memory and be a straightforward GPU port of our program, the second we will introduce local(shared) memory. Each program will be separated into two files, one containing C code and another containing the device kernels. Basic OpenCL concepts covered in the vector addition sample will not be covered in detail here for brevity. A description of the Game of Life and the original CPU code can be found here. The full source can be viewed or downloaded at the OLCF GitHub. Please direct any questions or comments to help@nccs.gov
GOL.c
#include <stdio.h> #include <stdlib.h> #include <math.h> #include <CL/opencl.h> #include <sys/stat.h> #define SRAND_VALUE 1985 #define LOCAL_SIZE 16 int main(int argc, char* argv[]) { int i,j,iter; int *h_grid; cl_mem d_grid; cl_mem d_newGrid; cl_mem d_tmpGrid; // Linear game grid dimension int dim = 1024; // Number of game iterations int maxIter = 1<<10; // Size, in bytes, of each vector size_t bytes = sizeof(int)*(dim+2)*(dim+2); // Allocate host Grid used for initial setup and read back from device h_grid = (int*)malloc(bytes); cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program //Kernels cl_kernel k_gol, k_ghostRows, k_ghostCols; // Assign initial population randomly srand(SRAND_VALUE); for(i = 1; i<=dim; i++) { for(j = 1; j<=dim; j++) { h_grid[i*(dim+2)+j] = rand() % 2; } } cl_int err; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); if (err != CL_SUCCESS) { printf( "Error: Failed to find a platform\n"); return EXIT_FAILURE; } // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group\n"); return EXIT_FAILURE; } // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context\n"); return EXIT_FAILURE; } // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); if (!queue) { printf("Error: Failed to create a command commands\n"); return EXIT_FAILURE; } // Create the compute program from the kernel source file char *fileName = "GOL-kernels.cl"; FILE *fh = fopen(fileName, "r"); if(!fh) { printf("Error: Failed to open file\n"); return 0; } struct stat statbuf; stat(fileName, &statbuf); char *kernelSource = (char *) malloc(statbuf.st_size + 1); fread(kernelSource, statbuf.st_size, 1, fh); kernelSource[statbuf.st_size] = '\0'; program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to build program executable %d\n", err); return EXIT_FAILURE; } // Create the GOL kernel in the program we wish to run k_gol = clCreateKernel(program, "GOL", &err); if (!k_gol || err != CL_SUCCESS) { printf("Error: Failed to create GOL kernel \n"); return EXIT_FAILURE; } // Create the ghostRows kernel in the program we wish to run k_ghostRows = clCreateKernel(program, "ghostRows", &err); if (!k_ghostRows || err != CL_SUCCESS) { printf("Error: Failed to create ghostRows kernel\n"); return EXIT_FAILURE; } // Create the ghostCols kernel in the program we wish to run k_ghostCols = clCreateKernel(program, "ghostCols", &err); if (!k_ghostCols || err != CL_SUCCESS) { printf("Error: Failed to create ghostCols kernel\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation d_grid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL); d_newGrid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL); if (!d_grid || !d_newGrid) { printf("Error: Failed to allocate device memory\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_grid, CL_TRUE, 0, bytes, h_grid, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array\n"); return EXIT_FAILURE; } // Set the arguments to GOL kernel err = clSetKernelArg(k_gol, 0, sizeof(int), &dim); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set the arguments to ghostRows kernel err = clSetKernelArg(k_ghostRows, 0, sizeof(int), &dim); err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set the arguments to ghostCols kernel err = clSetKernelArg(k_ghostCols, 0, sizeof(int), &dim); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set kernel local and global sizes size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize; cpyLocalSize = LOCAL_SIZE; // Number of total work items - localSize must be devisor cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize; cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize; size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE}; size_t linGlobal = (size_t)ceil(dim/(float)LOCAL_SIZE)*LOCAL_SIZE; size_t GolGlobalSize[2] = {linGlobal, linGlobal}; // Main game loop for (iter = 0; iter<maxIter; iter++) { err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize, 0, NULL, NULL); if(iter%2 == 1) { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid); } else { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid); } }// End main game loop if (err != CL_SUCCESS) { printf("Error: Failed to launch kernels%d\n",err); return EXIT_FAILURE; } // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device err = clEnqueueReadBuffer(queue, d_grid, CL_TRUE, 0, bytes, h_grid, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array\n"); return EXIT_FAILURE;; } // Sum up alive cells and print results int total = 0; for (i = 1; i<=dim; i++) { for (j = 1; j<=dim; j++) { total += h_grid[i*(dim+2)+j]; } } printf("Total Alive: %d\n", total); // Release memory free(h_grid); return 0; }
Changes
// Create the compute program from the kernel source file char *fileName = "GOL-kernels.cl"; FILE *fh = fopen(fileName, "r"); if(!fh) { printf("Error: Failed to open file\n"); return 0; } struct stat statbuf; stat(fileName, &statbuf); char *kernelSource = (char *) malloc(statbuf.st_size + 1); fread(kernelSource, statbuf.st_size, 1, fh); kernelSource[statbuf.st_size] = '\0'; program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program\n"); return EXIT_FAILURE; }
For more complicated kernels it is not reasonable to enter them as a char string in our c source file. Here we will read in the contents of GOL-kernels.cl which as we will see contains 3 OpenCL kernels.
// Set kernel local and global sizes size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize; cpyLocalSize = LOCAL_SIZE; // Number of total work items - localSize must be devisor cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize; cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize;
The copy kernels use a linear work group size. The cpyLocalSize must be a devisor of the cpy*GlobalSize and so we calculate the smallest integer that covers our problem domain and is divisible by cpyLocalSize.
size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE}; size_t linGlobal = (size_t)ceil(dim/(float)LOCAL_SIZE)*LOCAL_SIZE; size_t GolGlobalSize[2] = {linGlobal, linGlobal};
For the GOL kernel we specify a two dimensional work group size to better suite the problems geometry. The necessary global size is then be calculated.
// Main game loop for (iter = 0; iter<maxIter; iter++) { err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize, 0, NULL, NULL); if(iter%2 == 1) { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid); } else { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid); } }// End main game loop
OpenCL handles memory in terms of buffers, not pointers, so we are unable to preform a simple pointer swap. The process is relatively straight forward however as the argument order can be switched with a simple conditional statement inside of the main game loop.
GOL-kernels.cl
__kernel void ghostRows(const int dim, __global *grid) { // We want id to range from 1 to dim int id = get_global_id(0) + 1; if (id <= dim) { grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id]; //Copy first real row to bottom ghost row grid[id] = grid[(dim+2)*dim + id]; //Copy last real row to top ghost row } } __kernel void ghostCols(const int dim, __global *grid) { // We want id to range from 0 to dim+1 int id = get_global_id(0); if (id <= dim+1) { grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1]; //Copy first real column to right most ghost column grid[id*(dim+2)] = grid[id*(dim+2) + dim]; //Copy last real column to left most ghost column } } __kernel void GOL(const int dim, __global int *grid, __global int *newGrid) { int ix = get_global_id(0) + 1; int iy = get_global_id(1) + 1; int id = iy * (dim+2) + ix; int numNeighbors; if (iy <= dim && ix <= dim) { // Get the number of neighbors for a given grid point numNeighbors = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower + grid[id+1] + grid[id-1] //right left + grid[id+(dim+3)] + grid[id-(dim+3)] //diagonals + grid[id-(dim+1)] + grid[id+(dim+1)]; int cell = grid[id]; // Here we have explicitly all of the game rules if (cell == 1 && numNeighbors < 2) newGrid[id] = 0; else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3)) newGrid[id] = 1; else if (cell == 1 && numNeighbors > 3) newGrid[id] = 0; else if (cell == 0 && numNeighbors == 3) newGrid[id] = 1; else newGrid[id] = cell; } }
GOL-Local.c
In the first example each work item read nine values from global memory and wrote back one value. The inefficiency of this approach is that each value that is read from global memory is used not only in calculating its own new value but also by its eight closest neighbors in calculating their new value. Although some global memory access penalties have been alleviated in the latest generation of cards with the addition of an L2 cache you will find local memory is still an incredibly important aspect of GPU programming. Local memory provides a very fast memory space(comparable to register access) that is shared between work items of the same work-group. In the case where multiple work items in a particular work-group are accessing the same global memory item local memory may be used to reduce the number of global memory calls. A typical use for local memory is as follows:
- Have each work item read from global memory into local memory
- Synchronize work items to ensure all items in a given work-group have finished reading from global memory
- Have each work item preform calculations using local memory values
- Have each work item write result from local memory back to device global memory
#include <stdio.h> #include <stdlib.h> #include <math.h> #include <CL/opencl.h> #include <sys/stat.h> #define SRAND_VALUE 1985 #define LOCAL_SIZE 16 int main(int argc, char* argv[]) { int i,j,iter; int *h_grid; cl_mem d_grid; cl_mem d_newGrid; cl_mem d_tmpGrid; // Linear game grid dimension int dim = 1024; // Number of game iterations int maxIter = 1<<10; // Size, in bytes, of each vector size_t bytes = sizeof(int)*(dim+2)*(dim+2); // Allocate host Grid used for initial setup and read back from device h_grid = (int*)malloc(bytes); cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program //Kernels cl_kernel k_gol, k_ghostRows, k_ghostCols; // Assign initial population randomly srand(SRAND_VALUE); for(i = 1; i<=dim; i++) { for(j = 1; j<=dim; j++) { h_grid[i*(dim+2)+j] = rand() % 2; } } cl_int err; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); if (err != CL_SUCCESS) { printf( "Error: Failed to find a platform\n"); return EXIT_FAILURE; } // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group\n"); return EXIT_FAILURE; } // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context\n"); return EXIT_FAILURE; } // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); if (!queue) { printf("Error: Failed to create a command commands\n"); return EXIT_FAILURE; } // Create the compute program from the kernel source file char *fileName = "GOL-kernels.cl"; FILE *fh = fopen(fileName, "r"); if(!fh) { printf("Error: Failed to open file\n"); return 0; } struct stat statbuf; stat(fileName, &statbuf); char *kernelSource = (char *) malloc(statbuf.st_size + 1); fread(kernelSource, statbuf.st_size, 1, fh); kernelSource[statbuf.st_size] = '\0'; program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to build program executable %d\n", err); return EXIT_FAILURE; } // Create the GOL kernel in the program we wish to run k_gol = clCreateKernel(program, "GOL", &err); if (!k_gol || err != CL_SUCCESS) { printf("Error: Failed to create GOL kernel\n"); return EXIT_FAILURE; } // Create the ghostRows kernel in the program we wish to run k_ghostRows = clCreateKernel(program, "ghostRows", &err); if (!k_ghostRows || err != CL_SUCCESS) { printf("Error: Failed to create ghostRows kernel\n"); return EXIT_FAILURE; } // Create the ghostCols kernel in the program we wish to run k_ghostCols = clCreateKernel(program, "ghostCols", &err); if (!k_ghostCols || err != CL_SUCCESS) { printf("Error: Failed to create ghostCols kernel\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation d_grid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL); d_newGrid = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, NULL); if (!d_grid || !d_newGrid) { printf("Error: Failed to allocate device memory\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_grid, CL_TRUE, 0, bytes, h_grid, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array\n"); return EXIT_FAILURE; } // Set the arguments to GOL kernel err = clSetKernelArg(k_gol, 0, sizeof(int), &dim); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set the arguments to ghostRows kernel err = clSetKernelArg(k_ghostRows, 0, sizeof(int), &dim); err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set the arguments to ghostCols kernel err = clSetKernelArg(k_ghostCols, 0, sizeof(int), &dim); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments\n"); return EXIT_FAILURE; } // Set kernel local and global sizes size_t cpyRowsGlobalSize, cpyColsGlobalSize, cpyLocalSize; cpyLocalSize = LOCAL_SIZE; // Number of total work items - localSize must be devisor cpyRowsGlobalSize = (size_t)ceil(dim/(float)cpyLocalSize)*cpyLocalSize; cpyColsGlobalSize = (size_t)ceil((dim+2)/(float)cpyLocalSize)*cpyLocalSize; size_t GolLocalSize[2] = {LOCAL_SIZE, LOCAL_SIZE}; size_t linGlobal = (size_t)ceil(ceil(dim/(float)(LOCAL_SIZE-2))*(LOCAL_SIZE-2)/LOCAL_SIZE)*LOCAL_SIZE; size_t GolGlobalSize[2] = {linGlobal, linGlobal}; // Main game loop for (iter = 0; iter<maxIter; iter++) { err = clEnqueueNDRangeKernel(queue, k_ghostRows, 1, NULL, &cpyRowsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_ghostCols, 1, NULL, &cpyColsGlobalSize, &cpyLocalSize, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, k_gol, 2, NULL, GolGlobalSize, GolLocalSize, 0, NULL, NULL); if(iter%2 == 1) { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_grid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_newGrid); } else { err |= clSetKernelArg(k_ghostRows, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_ghostCols, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 1, sizeof(cl_mem), &d_newGrid); err |= clSetKernelArg(k_gol, 2, sizeof(cl_mem), &d_grid); } }// End main game loop if (err != CL_SUCCESS) { printf("Error: Failed to launch kernels%d\n",err); return EXIT_FAILURE; } // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device err = clEnqueueReadBuffer(queue, d_grid, CL_TRUE, 0, bytes, h_grid, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array\n"); return EXIT_FAILURE;; } // Sum up alive cells and print results int total = 0; for (i = 1; i<=dim; i++) { for (j = 1; j<=dim; j++) { total += h_grid[i*(dim+2)+j]; } } printf("Total Alive: %d\n", total); // Release memory free(h_grid); return 0; }
GOL-local-kernels.cl
#define LOCAL_SIZE_x 16 #define LOCAL_SIZE_y 16 __kernel void ghostRows(const int dim, __global int *grid) { // We want id to range from 1 to dim int id = get_global_id(0) + 1; if (id <= dim) { grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id]; //Copy first real row to bottom ghost row grid[id] = grid[(dim+2)*dim + id]; //Copy last real row to top ghost row } } __kernel void ghostCols(const int dim, __global int *grid) { // We want id to range from 0 to dim+1 int id = get_global_id(0); if (id <= dim+1) { grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1]; //Copy first real column to right most ghost column grid[id*(dim+2)] = grid[id*(dim+2) + dim]; //Copy last real column to left most ghost column } } __kernel void GOL(const int dim, __global int *grid, __global int *newGrid) { int ix = (get_local_size(0)-2) * get_group_id(0) + get_local_id(0); int iy = (get_local_size(1)-2) * get_group_id(1) + get_local_id(1); int id = iy * (dim+2) + ix; int i = get_local_id(0); int j = get_local_id(1); int numNeighbors; // Declare the local memory on a per work group level __local int s_grid[LOCAL_SIZE_y][LOCAL_SIZE_x]; // Copy cells into local memory if (ix <= dim+1 && iy <= dim+1) s_grid[i][j] = grid[id]; //Sync all work items in work group barrier(CLK_LOCAL_MEM_FENCE); if (iy <= dim && ix <= dim) { if(i != 0 && i !=LOCAL_SIZE_y-1 && j != 0 && j !=LOCAL_SIZE_x-1) { // Get the number of neighbors for a given grid point numNeighbors = s_grid[i+1][j] + s_grid[i-1][j] //upper lower + s_grid[i][j+1] + s_grid[i][j-1] //right left + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals + s_grid[i-1][j+1] + s_grid[i+1][j-1]; int cell = s_grid[i][j]; // Here we have explicitly all of the game rules if (cell == 1 && numNeighbors < 2) newGrid[id] = 0; else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3)) newGrid[id] = 1; else if (cell == 1 && numNeighbors > 3) newGrid[id] = 0; else if (cell == 0 && numNeighbors == 3) newGrid[id] = 1; else newGrid[id] = cell; } } }
Changes to GOL-local-kernels.cl
// Declare the local memory on a per work group level __local int s_grid[LOCAL_SIZE_y][LOCAL_SIZE_x];
The __local qualifier is used in the device kernel to allocate local memory arrays. The specified size is on a per work-group basis.
// Copy cells into local memory if (ix <= dim+1 && iy <= dim+1) s_grid[i][j] = grid[id];
Each work item that is within the bounds of our work group size, including ghost cells, will read one value from global memory into local memory.
//Sync all threads in work group barrier(CLK_LOCAL_MEM_FENCE);
Before we use any local memory we need to ensure that each work item in the work-group has finished reading its value from global memory into local memory by using a barrier, in this case the barrier is a work-group level barrier. When all work-items in a given work-group have reached the barrier execution will commence.
if (iy <= dim && ix <= dim) { if(i != 0 && i !=LOCAL_SIZE_y-1 && j != 0 && j !=LOCAL_SIZE_x-1) {
Before calculating our new values we must check that our global work item id’s, iy and ix, are do not extend into, or past, the ghost cells. Local work item id’s, i and j, must be checked so that no calculations are preformed in the per work-group ghost cells.
// Get the number of neighbors for a given grid point numNeighbors = s_grid[i+1][j] + s_grid[i-1][j] //upper lower + s_grid[i][j+1] + s_grid[i][j-1] //right left + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals + s_grid[i-1][j+1] + s_grid[i+1][j-1]; int cell = s_grid[i][j];
The standard GOL calculations are preformed only now we are reading all cell values from local memory instead of global memory.