In this tutorial we will cover three different CUDA versions of our GOL program, each highlighting a different CUDA memory area. The first uses only global memory and is a straightforward GPU port of our program; the second introduces shared memory; and the last covers the use of texture memory. A description of the Game of Life and the original CPU code can be found here. The full source can be viewed or downloaded at the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.

GOL-CUDA.cu

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define BLOCK_SIZE 16

__global__ void ghostRows(int dim, int *grid)
{
    // We want id ∈ [1,dim]
    int id = blockDim.x * blockIdx.x + threadIdx.x + 1;

    if (id <= dim)
    {
        //Copy first real row to bottom ghost row
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id];
        //Copy last real row to top ghost row
        grid[id] = grid[(dim+2)*dim + id];
    }
}

__global__ void ghostCols(int dim, int *grid)
{
    // We want id ∈ [0,dim+1]
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    if (id <= dim+1)
    {
        //Copy first real column to right most ghost column
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1];
        //Copy last real column to left most ghost column 
        grid[id*(dim+2)] = grid[id*(dim+2) + dim];    
    }
}

__global__ void GOL(int dim, int *grid, int *newGrid)
{
    // We want id ∈ [1,dim]
    int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;
    int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;
    int id = iy * (dim+2) + ix;

    int numNeighbors;

    if (iy <= dim && ix <= dim) {

        // Get the number of neighbors for a given grid point
        numNeighbors = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower
                     + grid[id+1] + grid[id-1]             //right left
                     + grid[id+(dim+3)] + grid[id-(dim+3)] //diagonals
                     + grid[id-(dim+1)] + grid[id+(dim+1)];

        int cell = grid[id];
        // Here we have explicitly all of the game rules
        if (cell == 1 && numNeighbors < 2)
            newGrid[id] = 0;
        else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
            newGrid[id] = 1;
        else if (cell == 1 && numNeighbors > 3)
            newGrid[id] = 0;
        else if (cell == 0 && numNeighbors == 3)
            newGrid[id] = 1;
        else
            newGrid[id] = cell;
    }
}

int main(int argc, char* argv[])
{
    int i,j,iter;
    int* h_grid; //Grid on host
    int* d_grid; //Grid on device
    int* d_newGrid; //Second grid used on device only
    int* d_tmpGrid; //tmp grid pointer used to switch between grid and newGrid

    int dim = 1024; //Linear dimension of our grid - not counting ghost cells
    int maxIter = 1<<10; //Number of game steps

    size_t bytes = sizeof(int)*(dim+2)*(dim+2);//2 added for periodic boundary condition ghost cells
    // Allocate host Grid used for initial setup and read back from device
    h_grid = (int*)malloc(bytes);

    // Allocate device grids
    cudaMalloc(&d_grid, bytes);
    cudaMalloc(&d_newGrid, bytes);

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // Copy over initial game grid (Dim-1 threads)
    cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);

    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE,1);
    int linGrid = (int)ceil(dim/(float)BLOCK_SIZE);
    dim3 gridSize(linGrid,linGrid,1);

    dim3 cpyBlockSize(BLOCK_SIZE,1,1);
    dim3 cpyGridRowsGridSize((int)ceil(dim/(float)cpyBlockSize.x),1,1);
    dim3 cpyGridColsGridSize((int)ceil((dim+2)/(float)cpyBlockSize.x),1,1);

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {

        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);

        // Swap our grids and iterate again
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }//iter loop

    // Copy back results and sum
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total);

    // Release memory
    cudaFree(d_grid);
    cudaFree(d_newGrid);
    free(h_grid);

    return 0;
}

Changes

#define BLOCK_SIZE 16

The number of threads per block is a common performance tuning parameter and is often set with a preprocessor define so that it can be changed easily.
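
If desired, the define can be guarded so the block size may be overridden at compile time without editing the source; the tutorial code does not do this, but the pattern is simple:

// Optional variation (not in the tutorial source): allow the block size to be
// overridden on the nvcc command line, e.g. nvcc -DBLOCK_SIZE=32 GOL-CUDA.cu
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif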

__global__ void ghostRows(int dim, int *grid)
{
    // We want id ∈ [1,dim]
    int id = blockDim.x * blockIdx.x + threadIdx.x + 1;
 
    if (id <= dim)
    {
        //Copy first real row to bottom ghost row
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id];
        //Copy last real row to top ghost row
        grid[id] = grid[(dim+2)*dim + id];
    }
}
__global__ void ghostCols(int dim, int *grid)
{
    // We want id ∈ [0,dim+1]
    int id = blockDim.x * blockIdx.x + threadIdx.x;
 
    if (id <= dim+1)
    {
        //Copy first real column to right most ghost column
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1];
        //Copy last real column to left most ghost column
        grid[id*(dim+2)] = grid[id*(dim+2) + dim];
    }
}

The ghostRows and ghostCols kernels are used to fill in our ghost rows and columns in device global memory. Each thread is responsible for copying two integers.
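
To make the indexing concrete, the comment below sketches the layout for a hypothetical dim = 4 board; it is an illustration only, not part of the source.

/* Row-major layout of the (dim+2)*(dim+2) array for a hypothetical dim = 4
   board (G = ghost cell, . = real cell):

           col:  0 1 2 3 4 5
        row 0:   G G G G G G   <- ghostRows copies real row dim (row 4) here
        row 1:   G . . . . G
        row 2:   G . . . . G
        row 3:   G . . . . G
        row 4:   G . . . . G
        row 5:   G G G G G G   <- ghostRows copies real row 1 here

   ghostCols then copies real column 1 into column dim+1 and real column dim
   into column 0; because its id runs over rows 0 through dim+1 it also fills
   the four corner ghost cells using the values ghostRows just wrote. */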

__global__ void GOL(int dim, int *grid, int *newGrid)
{
    // We want id ∈ [1,dim]
    int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;
    int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;
    int id = iy * (dim+2) + ix;
 
    int numNeighbors;
 
    if (iy <= dim && ix <= dim) {
 
        // Get the number of neighbors for a given grid point
        numNeighbors = grid[id+(dim+2)] + grid[id-(dim+2)] //upper lower
                     + grid[id+1] + grid[id-1]             //right left
                     + grid[id+(dim+3)] + grid[id-(dim+3)] //diagonals
                     + grid[id-(dim+1)] + grid[id+(dim+1)];
 
        int cell = grid[id];
        // Here we have explicitly all of the game rules
        if (cell == 1 && numNeighbors < 2)
            newGrid[id] = 0;
        else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
            newGrid[id] = 1;
        else if (cell == 1 && numNeighbors > 3)
            newGrid[id] = 0;
        else if (cell == 0 && numNeighbors == 3)
            newGrid[id] = 1;
        else
            newGrid[id] = cell;
    }
}

The GOL kernel performs the majority of our work. We use a two-dimensional block and grid size to better fit our problem's geometry. Once a thread has calculated its global id and tested that it is not out of bounds, it reads the cell and its eight neighbors from global memory, calculates the new value, and writes the result back to global memory.
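
The neighbor offsets in the sum above follow directly from the row stride of dim+2; spelled out:

// Offsets relative to id in the flattened grid (row stride = dim+2):
//   up        : id - (dim+2)           down       : id + (dim+2)
//   left      : id - 1                 right      : id + 1
//   up-left   : id - (dim+2) - 1  =  id - (dim+3)
//   up-right  : id - (dim+2) + 1  =  id - (dim+1)
//   down-left : id + (dim+2) - 1  =  id + (dim+1)
//   down-right: id + (dim+2) + 1  =  id + (dim+3)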

int* d_grid; //Grid on device
int* d_newGrid; //Second grid used on device only
int* d_tmpGrid; //tmp grid pointer used to switch between grid and newGrid

We declare our device pointers on the host just as we normally would; device pointer names are prefixed with d_ to help distinguish them from host pointers, which we prefix with h_.

// Allocate device grids
cudaMalloc(&d_grid, bytes);
cudaMalloc(&d_newGrid, bytes);

Both of our device arrays are allocated in device global memory. A one-dimensional array is used for each grid, with the 2D indexing handled explicitly.
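
Every access into these flattened grids uses the same row-major arithmetic; a small helper macro (hypothetical, not used in the tutorial code) makes the mapping explicit:

// Hypothetical convenience macro: flat index of (row, col) in a grid with dim
// real cells per side plus one ghost cell on each border.
#define GRID_INDEX(row, col, dim) ((row) * ((dim) + 2) + (col))
// e.g. h_grid[GRID_INDEX(i, j, dim)] is the same element as h_grid[i*(dim+2)+j]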

// Copy over initial game grid (Dim-1 threads)
cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);

After we have filled h_grid on the host we must copy it over to d_grid, which resides in device global memory.
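
For brevity the listing does not check the return values of the CUDA API calls; in real code it is worth inspecting the returned cudaError_t, along the lines of this sketch (not part of the tutorial source):

cudaError_t err = cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}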

dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE,1);
dim3 cpyBlockSize(BLOCK_SIZE,1,1);

The CUDA data type dim3 is used to define the number of threads in our block. For the GOL kernel we specify a two-dimensional block size to better suit our problem's geometry; for the ghost cell copy kernels a simple one-dimensional size is best. Note that any dimensions omitted from the constructor are assumed to be 1.
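
For example, the trailing dimensions may simply be omitted:

dim3 cpyBlockSize(BLOCK_SIZE);          // same as dim3 cpyBlockSize(BLOCK_SIZE, 1, 1)
dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE); // same as dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1)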

int linGrid = (int)ceil(dim/(float)BLOCK_SIZE);
dim3 gridSize(linGrid,linGrid,1);
dim3 cpyGridRowsGridSize((int)ceil(dim/(float)cpyBlockSize.x),1,1);
dim3 cpyGridColsGridSize((int)ceil((dim+2)/(float)cpyBlockSize.x),1,1);

With our block size specified we must calculate the smallest integer number of blocks necessary to completely cover our game board; this is also of type dim3.
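
With the defaults used in this listing the arithmetic works out as follows:

// dim = 1024, BLOCK_SIZE = 16:
//   linGrid = ceil(1024 / 16.0) = 64
// so GOL is launched on a 64 x 64 grid of 16 x 16 blocks (4096 blocks of 256
// threads), ghostRows uses ceil(1024/16) = 64 blocks, and ghostCols uses
// ceil(1026/16) = 65 blocks.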

ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);

On each iteration of the game loop we launch our three kernels, first copying the ghost cells and then performing one game step.

// Swap our grids and iterate again
d_tmpGrid = d_grid;
d_grid = d_newGrid;
d_newGrid = d_tmpGrid;

The grid and newGrid device pointers can be switched just as we did with the host pointers in the CPU version.

// Copy back results and sum
cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);

Once our main game loop is complete we must copy the grid from device memory to host memory.

// Release memory
cudaFree(d_grid);
cudaFree(d_newGrid);

Having used cudaMalloc to allocate our device memory we release it with cudaFree.

Compiling

module load cudatoolkit
nvcc GOL-CUDA.cu -o GOL-CUDA.out

GOL-CUDA-Shared.cu

In the first example each thread read nine values from global memory and wrote back one value. The inefficiency of this approach is that each value read from global memory is used not only to calculate that cell's new value but also by its eight closest neighbors to calculate theirs. Although some global memory access penalties have been alleviated in the latest generation of cards with the addition of an L2 cache, you will find shared memory is still an incredibly important aspect of GPU programming. Shared memory provides a very fast memory space (comparable to register access) that is shared between threads of the same thread block. Where multiple threads in a thread block access the same global memory item, shared memory may be used to reduce the number of global memory calls. A typical use for shared memory is as follows:

  1. Have each thread read from global memory into shared memory
  2. Synchronize threads to ensure all threads in a given block have finished reading from global memory
  3. Have each thread perform calculations using shared memory values
  4. Have each thread write result from shared memory back to device global memory

For detailed coverage of shared memory please see the CUDA C Programming guide.
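
As a minimal, self-contained illustration of those four steps, the hypothetical kernel below (not part of the GOL code) computes a 1D three-point average by staging a tile plus one halo element per side in shared memory:

#define TILE 128

// Hypothetical example: out[g] = average of in[g-1], in[g], in[g+1],
// launched as smooth<<<(n + TILE - 1) / TILE, TILE>>>(n, d_in, d_out)
__global__ void smooth(int n, const float *in, float *out)
{
    __shared__ float s[TILE + 2];                   // tile plus one halo cell per side
    int g = blockIdx.x * blockDim.x + threadIdx.x;  // global index
    int l = threadIdx.x + 1;                        // local index, shifted past left halo

    // 1. Read from global memory into shared memory (zero-pad past the ends)
    s[l] = (g < n) ? in[g] : 0.0f;
    if (threadIdx.x == 0)
        s[0] = (g > 0) ? in[g - 1] : 0.0f;
    if (threadIdx.x == blockDim.x - 1)
        s[TILE + 1] = (g + 1 < n) ? in[g + 1] : 0.0f;

    // 2. Synchronize so every thread's loads are visible to the whole block
    __syncthreads();

    // 3. and 4. Compute from shared memory and write the result to global memory
    if (g < n)
        out[g] = (s[l - 1] + s[l] + s[l + 1]) / 3.0f;
}

The GOL kernel below uses the same idea in two dimensions, except that the halo cells are loaded by letting neighboring thread blocks overlap rather than by dedicated halo loads.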

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define BLOCK_SIZE_x 32
#define BLOCK_SIZE_y 16

__global__ void ghostRows(int dim, int* grid)
{
    // We want id ∈ [1,dim]
    int id = blockDim.x * blockIdx.x + threadIdx.x + 1;

    if (id <= dim)
    {
        //Copy first real row to bottom ghost row
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id]; 
        //Copy last real row to top ghost row
        grid[id] = grid[(dim+2)*dim + id]; 
    }
}
__global__ void ghostCols(int dim, int* grid)
{
    // We want id ∈ [0,dim+1]
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    if (id <= dim+1)
    {
        //Copy first real column to right most ghost column
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1]; 
        //Copy last real column to left most ghost column
        grid[id*(dim+2)] = grid[id*(dim+2) + dim];                                                  
    }
}

__global__ void GOL(int dim, int *grid, int *newGrid)
{
    int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;
    int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;
    int id = iy * (dim+2) + ix;

    int i = threadIdx.y;
    int j = threadIdx.x;
    int numNeighbors;

    // Declare the shared memory on a per block level
    __shared__ int s_grid[BLOCK_SIZE_y][BLOCK_SIZE_x];

    // Copy cells into shared memory
    if (ix <= dim+1 && iy <= dim+1)
        s_grid[i][j] = grid[id];

    //Sync all threads in block
    __syncthreads();

    if (iy <= dim && ix <= dim) {
        if(i != 0 && i !=blockDim.y-1 && j != 0 && j !=blockDim.x-1) {

            // Get the number of neighbors for a given grid point
            numNeighbors = s_grid[i+1][j] + s_grid[i-1][j] //upper lower
                         + s_grid[i][j+1] + s_grid[i][j-1] //right left
                         + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals
                         + s_grid[i-1][j+1] + s_grid[i+1][j-1];

            int cell = s_grid[i][j];

            // Here we have explicitly all of the game rules
            if (cell == 1 && numNeighbors < 2)
                newGrid[id] = 0;
            else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
                newGrid[id] = 1;
            else if (cell == 1 && numNeighbors > 3)
                newGrid[id] = 0;
            else if (cell == 0 && numNeighbors == 3)
                newGrid[id] = 1;
            else
                newGrid[id] = cell;
        }
    }
}

int main(int argc, char* argv[])
{
    int i,j,iter;
    int *h_grid; //Grid on host
    int *d_grid; //Grid on device
    int *d_newGrid; //Second grid used on device only
    int *d_tmpGrid; //tmp grid pointer used to switch between grid and newGrid

    int dim = 1024; //Linear dimension of our grid - not counting ghost cells
    int maxIter = 1<<10; //Number of game steps

    size_t bytes = sizeof(int)*(dim+2)*(dim+2);
    // Allocate host Grid used for initial setup and read back from device
    h_grid = (int*)malloc(bytes);

    // Allocate device grids
    cudaMalloc(&d_grid, bytes);
    cudaMalloc(&d_newGrid, bytes);

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    cudaFuncSetCacheConfig(GOL, cudaFuncCachePreferShared);

    // Copy over initial game grid (Dim-1 threads)
    cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);

    dim3 blockSize(BLOCK_SIZE_x, BLOCK_SIZE_y,1);
    int linGrid_x = (int)ceil(dim/(float)(BLOCK_SIZE_x-2));
    int linGrid_y = (int)ceil(dim/(float)(BLOCK_SIZE_y-2));
    dim3 gridSize(linGrid_x,linGrid_y,1);

    dim3 cpyBlockSize(BLOCK_SIZE_x,1,1);
    dim3 cpyGridRowsGridSize((int)ceil(dim/(float)cpyBlockSize.x),1,1);
    dim3 cpyGridColsGridSize((int)ceil((dim+2)/(float)cpyBlockSize.x),1,1);

    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {

        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);

        // Swap our grids and iterate again
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }//iter loop

    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
        printf("CUDA error %s\n",cudaGetErrorString(error));

    // Copy back results and sum
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);

    // Sum up alive cells and print results
    int total = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total);

    cudaFree(d_grid);
    cudaFree(d_newGrid);
    free(h_grid);

    return 0;
}

Changes

#define BLOCK_SIZE_x 32
#define BLOCK_SIZE_y 16

A square block size is not always best performance-wise; for greater flexibility we switch to separate x and y dimensions.

int iy = (blockDim.y -2) * blockIdx.y + threadIdx.y;
int ix = (blockDim.x -2) * blockIdx.x + threadIdx.x;
int id = iy * (dim+2) + ix;

Extra care has to be taken when determining our global thread id for the game kernel. Each thread block contains two ghost rows and two ghost columns that must be accounted for when calculating the global indices iy and ix.
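
A quick check of the arithmetic with the block sizes above shows how neighboring blocks overlap by their halos:

// With BLOCK_SIZE_x = 32: block 0 loads global columns ix = 0..31 and updates
// columns 1..30; block 1 starts at ix = (32-2)*1 = 30, loads columns 30..61 and
// updates columns 31..60. Adjacent blocks therefore re-load the two halo columns
// they share but never write to them. The same holds in y with BLOCK_SIZE_y = 16,
// where each block updates 14 rows.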

int i = threadIdx.y;
int j = threadIdx.x;

We will also define some convenience variables for the block thread indices.

// Declare the shared memory on a per block level
__shared__ int s_grid[BLOCK_SIZE_y][BLOCK_SIZE_x];

The __shared__ qualifier is used in the device kernel to allocate shared memory arrays. The specified size is on a per block basis and, for a statically declared array such as this one, must be a compile time constant, which is why the BLOCK_SIZE_x and BLOCK_SIZE_y defines are used.
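
If the block size were not fixed at compile time, dynamically sized shared memory could be used instead. The tutorial does not do this, but a sketch of the pattern looks roughly like the following (GOL_dynamic is a hypothetical name):

// Hypothetical variant using dynamic shared memory: the per-block size is
// supplied as the third launch parameter and the array is indexed manually.
__global__ void GOL_dynamic(int dim, int *grid, int *newGrid)
{
    extern __shared__ int s_grid[];   // one int per thread, sized at launch time

    int iy = (blockDim.y - 2) * blockIdx.y + threadIdx.y;
    int ix = (blockDim.x - 2) * blockIdx.x + threadIdx.x;
    int i  = threadIdx.y;
    int j  = threadIdx.x;

    if (ix <= dim+1 && iy <= dim+1)
        s_grid[i * blockDim.x + j] = grid[iy * (dim+2) + ix];  // was s_grid[i][j]
    __syncthreads();

    // ... the remainder is identical to the static version, with every
    // s_grid[i][j] replaced by s_grid[i * blockDim.x + j]
}

// Launched with the shared memory size in bytes as the third parameter:
//   GOL_dynamic<<<gridSize, blockSize, blockSize.x * blockSize.y * sizeof(int)>>>(dim, d_grid, d_newGrid);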

// Copy cells into shared memory
if (ix <= dim+1 && iy <= dim+1)
    s_grid[i][j] = grid[id];

Each thread that is within the bounds of our grid, including ghost cells, will read one value from global memory into shared memory.

//Sync all threads in block
__syncthreads();

Before we use any shared memory values we must ensure that each thread in the block has finished reading its value from global memory into shared memory by calling __syncthreads(), a thread block level barrier. Once all threads in a given block have reached the barrier, execution continues.

if (iy <= dim && ix <= dim) {
    if(i != 0 && i !=blockDim.y-1 && j != 0 && j !=blockDim.x-1) {

Before calculating our new values we must check that our global thread ids, iy and ix, do not extend into, or past, the ghost cells. The local thread ids, i and j, must also be checked so that no calculations are performed in the per block ghost cells.

// Get the number of neighbors for a given grid point
numNeighbors = s_grid[i+1][j] + s_grid[i-1][j]     //upper lower
            + s_grid[i][j+1] + s_grid[i][j-1]      //right left
            + s_grid[i+1][j+1] + s_grid[i-1][j-1] //diagonals
            + s_grid[i-1][j+1] + s_grid[i+1][j-1];

int cell = s_grid[i][j];

The standard GOL calculations are performed as before, except that all cell values are now read from shared memory instead of global memory.

dim3 blockSize(BLOCK_SIZE_x, BLOCK_SIZE_y,1);
int linGrid_x = (int)ceil(dim/(float)(BLOCK_SIZE_x-2));
int linGrid_y = (int)ceil(dim/(float)(BLOCK_SIZE_y-2));
dim3 gridSize(linGrid_x,linGrid_y,1);

A slight modification is needed to set the correct block and grid size: the x and y dimensions are now independent, and because each block updates only its interior (BLOCK_SIZE-2) cells, the grid must contain enough blocks to cover the board with that smaller effective tile.
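
With the defaults in this listing the numbers come out as:

// dim = 1024, BLOCK_SIZE_x = 32, BLOCK_SIZE_y = 16:
//   each block updates (32-2) x (16-2) = 30 x 14 cells, so
//   linGrid_x = ceil(1024 / 30.0) = 35
//   linGrid_y = ceil(1024 / 14.0) = 74
// giving a 35 x 74 grid of 32 x 16 thread blocks.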

GOL-CUDA-Texture.cu

The texture cache example is provided for completeness but is not fully documented. The texture cache is optimized for 2D spatial locality and does not pollute the L1 cache; some programs may benefit from its smaller cache line compared to the L1 cache.

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

texture<int,2> gridTex;

__global__ void GOL(int dim, int *newGrid)
{
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int id = iy * dim + ix;

    int numNeighbors;

    float iyTex = (iy + 0.5f)/dim;
    float ixTex = (ix + 0.5f)/dim;
    float oneTex = 1.0f/dim;

    if (iy < dim && ix < dim)
    {
        //Get the number of neighbors for a given grid point
        numNeighbors = tex2D(gridTex, iyTex+oneTex, ixTex) //upper/lower
                     + tex2D(gridTex, iyTex-oneTex, ixTex)
                     + tex2D(gridTex, iyTex, ixTex+oneTex) //right/left
                     + tex2D(gridTex, iyTex, ixTex-oneTex)
                     + tex2D(gridTex, iyTex-oneTex, ixTex-oneTex) //diagonals
                     + tex2D(gridTex, iyTex-oneTex, ixTex+oneTex)
                     + tex2D(gridTex, iyTex+oneTex, ixTex-oneTex)
                     + tex2D(gridTex, iyTex+oneTex, ixTex+oneTex);

        int cell = tex2D(gridTex, iyTex, ixTex);

        //Here we have explicitly all of the game rules
        if (cell == 1 && numNeighbors < 2)
            newGrid[id] = 0;
        else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
            newGrid[id] = 1;
        else if (cell == 1 && numNeighbors > 3)
            newGrid[id] = 0;
        else if (cell == 0 && numNeighbors == 3)
            newGrid[id] = 1;
        else
            newGrid[id] = cell;
    }
}

int main(int argc, char* argv[])
{
    int i,j,iter;
    int* h_grid; //Grid on host
    cudaArray* d_grid; //Grid on device
    int* d_newGrid; //Second grid used on device only

    int dim = 1024; //Linear dimension of our grid - not counting ghost cells
    int maxIter = 1<<10; //Number of game steps

    size_t bytes = sizeof(int)*dim*dim;
    //Allocate host Grid used for initial setup and read back from device
    h_grid = (int*)malloc(bytes);

    //Allocate device grids
    cudaMallocArray(&d_grid, &gridTex.channelDesc, dim, dim);
    cudaMalloc(&d_newGrid, bytes);

    //Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 0; i<dim; i++) {
        for(j = 0; j<dim; j++) {
            h_grid[i*dim+j] = rand() % 2;
        }
    }

    //Copy over initial game grid (Dim-1 threads)
    cudaMemcpyToArray (d_grid, 0, 0, h_grid, bytes, cudaMemcpyHostToDevice);
    cudaBindTextureToArray(gridTex, d_grid);

    gridTex.normalized = true;
    gridTex.addressMode[0] = cudaAddressModeWrap;
    gridTex.addressMode[1] = cudaAddressModeWrap;

    dim3 dimBlock(8,8);
    int linGrid = (int)ceil(dim/(float)dimBlock.x);
    dim3 dimGrid(linGrid,linGrid);

    //Main game loop
    for (iter = 0; iter<maxIter; iter++) {
        GOL<<<dimGrid,dimBlock>>>(dim, d_newGrid);

        //Swap our grids and iterate again
        cudaMemcpyToArray (d_grid, 0, 0, d_newGrid, bytes, cudaMemcpyDeviceToDevice);
    }//iter loop

    //Copy back results and sum
    cudaMemcpy(h_grid, d_newGrid, bytes, cudaMemcpyDeviceToHost);

    //Sum up alive cells and print results
    int total = 0;
    for (i = 0; i<dim; i++) {
        for (j = 0; j<dim; j++) {
            total += h_grid[i*dim+j];
        }
    }
    printf("Total Alive: %d\n", total);

    cudaFreeArray(d_grid); //d_grid was allocated with cudaMallocArray, so free it with cudaFreeArray
    cudaFree(d_newGrid);
    free(h_grid);

    return 0;
}
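
Compiling

Compiling follows the same pattern as the first example. Note that the texture reference API used here (texture<>, cudaBindTextureToArray, cudaMemcpyToArray) is deprecated and has been removed from recent CUDA toolkits, so this listing may require an older cudatoolkit version to build.

module load cudatoolkit
nvcc GOL-CUDA-Texture.cu -o GOL-CUDA-Texture.out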