In this tutorial we will cover two different CUDA Fortran versions of our Game of Life program. The first strictly uses global memory and is a straightforward GPU port of our program; the second introduces the use of shared memory. For a description of the Game of Life and the original CPU code please see the tutorial here. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.

GOL.cuf

module kernels
    contains

    attributes(global) subroutine ghostCols_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [2,dim+1]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

        if(id < dim+2) then
            grid(id, dim+2) = grid(id, 2) !Copy first real column to right most ghost column
            grid(id, 1) = grid(id, dim+1) !Copy last real column to left most ghost column
        endif

    end subroutine ghostCols_kernel

    attributes(global) subroutine ghostRows_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [1,dim+2]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x

        if(id <= dim+2) then
            grid(dim+2, id) = grid(2, id) !Copy first real row to bottom ghost row
            grid(1, id) = grid(dim+1 ,id) !Copy last real row to top ghost row
        endif
    end subroutine ghostRows_kernel

    attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid, newGrid
        integer :: idx, idy, id, numNeighbors, cell

        !Get global thread IDs
        idx = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1
        idy = (blockIdx%y-1) * blockDim%y + threadIdx%y + 1

        if(idx < dim+2 .AND. idy < dim+2) then
            numNeighbors = grid(idy,idx+1) + grid(idy,idx-1) &     !right & left
                         + grid(idy+1,idx) + grid(idy-1,idx) &     !upper & lower
                         + grid(idy+1,idx+1) + grid(idy-1,idx-1) & !diagonals
                         + grid(idy-1,idx+1) + grid(idy+1,idx-1)

            cell = grid(idy,idx)
            !Here we have explicitly all of the game rules
            if(cell == 1 .AND. numNeighbors < 2) then
                newGrid(idy,idx) = 0
            elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                newGrid(idy,idx) = 1
            elseif(cell == 1 .AND. numNeighbors > 3) then
                newGrid(idy,idx) = 0
            elseif(cell == 0 .AND. numNeighbors == 3) then
                newGrid(idy,idx) = 1
            else
                newGrid(idy,idx) = cell
            endif
        endif

    end subroutine GOL_kernel
end module kernels

program main
    use cudafor
    use kernels

    implicit none

    integer :: i,j,iter,seed(8),numNeighbors,total
    real :: randm
    !Linear game grid dimension
    integer :: dim = 1024
    !Number of game iterations
    integer :: maxIter = 2**10
    integer :: blockDim = 16
    type(dim3) :: cpyBlockSize, cpyColsGridSize, cpyRowsGridSize, blockSize, gridSize

    !Host game grid array
    integer,dimension(:,:),allocatable :: h_grid

    !Device game grid arrays
    integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid

    !Allocate square grid of (dim+2)^2 elements on host, 2 added for ghost cells
    allocate(h_grid(dim+2,dim+2))

    !Allocate grid on device
    allocate(d_grid(dim+2,dim+2))
    allocate(d_newGrid(dim+2,dim+2))

    !Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            h_grid(i,j) = nint(randm)
        enddo
    enddo

    !Implicit copy of host vectors to device
    d_grid = h_grid

    !Set the number of threads per block and blocks per grid
    cpyBlockSize    = dim3(blockDim, 1, 1)
    cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
    cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)

    blockSize   = dim3(blockDim, blockDim, 1)
    gridSize    = dim3(ceiling(real(dim)/real(blockSize%x)), &
                         ceiling(real(dim)/real(blockSize%y)), 1)

    !Main game loop
    do iter=1,maxIter
        !device pointers are not supported currently so manually switch grid/newGrid
        if(MOD(iter,2) == 1) then
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
        else
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
        endif
    enddo

    !Implicit copy of device vector back to host
    h_grid = d_grid

    !Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + h_grid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    !Release memory
    deallocate(h_grid)
    deallocate(d_grid)
    deallocate(d_newGrid)

end program

Changes to GOL.cuf

    attributes(global) subroutine ghostCols_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [2,dim+1]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

        if(id < dim+2) then
            grid(id, dim+2) = grid(id, 2) !Copy first real column to right most ghost column
            grid(id, 1) = grid(id, dim+1) !Copy last real column to left most ghost column
        endif

    end subroutine ghostCols_kernel

    attributes(global) subroutine ghostRows_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [1,dim+2]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x

        if(id <= dim+2) then
            grid(dim+2, id) = grid(2, id) !Copy first real row to bottom ghost row
            grid(1, id) = grid(dim+1 ,id) !Copy last real row to top ghost row
        endif
    end subroutine ghostRows_kernel

The ghostRows_kernel and ghostCols_kernel device subroutines fill in our ghost rows and columns in device global memory. Each thread is responsible for copying two integers.

    attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid, newGrid
        integer :: idx, idy, id, numNeighbors, cell

        !Get global thread IDs
        idx = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1
        idy = (blockIdx%y-1) * blockDim%y + threadIdx%y + 1

        if(idx < dim+2 .AND. idy < dim+2) then
            numNeighbors = grid(idy,idx+1) + grid(idy,idx-1) &     !right & left
                         + grid(idy+1,idx) + grid(idy-1,idx) &     !upper & lower
                         + grid(idy+1,idx+1) + grid(idy-1,idx-1) & !diagonals
                         + grid(idy-1,idx+1) + grid(idy+1,idx-1)

            cell = grid(idy,idx)
            !Here we have explicitly all of the game rules
            if(cell == 1 .AND. numNeighbors < 2) then
                newGrid(idy,idx) = 0
            elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                newGrid(idy,idx) = 1
            elseif(cell == 1 .AND. numNeighbors > 3) then
                newGrid(idy,idx) = 0
            elseif(cell == 0 .AND. numNeighbors == 3) then
                newGrid(idy,idx) = 1
            else
                newGrid(idy,idx) = cell
            endif
        endif

    end subroutine GOL_kernel

The GOL kernel performs the majority of our work. We use two-dimensional block and grid sizes to better fit our problem's geometry. Once a thread has calculated its global IDs and tested that it is not out of bounds, it reads the cell and its eight neighbors from global memory, calculates the new value, and writes the result back to global memory.

    !Host game grid array
    integer,dimension(:,:),allocatable :: h_grid
 
    !Device game grid arrays
    integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid

Currently CUDA Fortran does not support device pointers, so we change our array attribute from pointer to allocatable.
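
For comparison, this is roughly what the pointer swap looks like in a host-only Fortran code; the names grid, newGrid, and tmpGrid are illustrative and not taken from the CPU tutorial. Because the device arrays must be allocatable, no such swap is possible, which is why the main loop below alternates kernel arguments instead.

    !Sketch only: a pointer-based CPU code can exchange grids in constant time
    integer,dimension(:,:),pointer :: grid, newGrid, tmpGrid

    !...after computing one iteration into newGrid, swap the roles of the arrays
    tmpGrid => grid
    grid    => newGrid
    newGrid => tmpGrid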

    !Allocate grid on device
    allocate(d_grid(dim+2,dim+2))
    allocate(d_newGrid(dim+2,dim+2))

Given an array with the device attribute, memory will be allocated in the GPU's global memory space.

    !Implicit copy of host vectors to device
    d_grid = h_grid

By setting the device array equal to the host array, a host-to-device DMA copy is implicitly initiated.
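
The same transfer can also be written explicitly through the cudafor cudaMemcpy interface. A minimal sketch, assuming an integer variable istat has been declared for the return code (the tutorial code does not declare one):

    !Sketch only: explicit equivalent of d_grid = h_grid
    !CUDA Fortran's generic cudaMemcpy takes a count in elements, not bytes
    istat = cudaMemcpy(d_grid, h_grid, (dim+2)*(dim+2))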

    !Set the number of threads per block and blocks per grid
    cpyBlockSize    = dim3(blockDim, 1, 1)
    cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
    cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)
 
    blockSize   = dim3(blockDim, blockDim, 1)
    gridSize    = dim3(ceiling(real(dim)/real(blockSize%x)), &
                         ceiling(real(dim)/real(blockSize%y)), 1)

With our block size specified, we calculate the smallest integer number of blocks necessary to completely cover our game board; this is also stored in the CUDA Fortran derived type dim3.
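
For the values used in this example, dim = 1024 and blockDim = 16, these expressions work out as follows:

    !cpyColsGridSize = dim3(ceiling(1024.0/16.0), 1, 1) -> 64 blocks of 16 threads
    !cpyRowsGridSize = dim3(ceiling(1026.0/16.0), 1, 1) -> 65 blocks of 16 threads
    !gridSize        = dim3(64, 64, 1)                  -> 4096 blocks of 16x16 threads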

    !Main game loop
    do iter=1,maxIter
        !device pointers are not supported currently so manually switch grid/newGrid
        if(MOD(iter,2) == 1) then
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
        else
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
        endif
    enddo

Although we cannot simply swap our device grid pointers at the end of each game iteration, because CUDA Fortran does not support device pointers, we can swap the arguments we call our kernels with. This achieves the same effect with a minimal increase in code size.

    !Implicit copy of device vector back to host
    h_grid = d_grid

After our kernels have completed, we implicitly copy the grid array from the device back to the host.
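
The assignment waits for previously launched kernels on the default stream to finish before copying. The tutorial code does not check for errors, but a minimal sketch of doing so with the cudafor error API (again assuming an integer istat) would be:

    !Sketch only: check for kernel launch or execution errors after the copy
    istat = cudaGetLastError()
    if (istat /= cudaSuccess) print *, trim(cudaGetErrorString(istat))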

GOL-Shared.cuf

In the first example each thread read nine values from global memory and wrote back one value. The inefficiency of this approach is that each value read from global memory is used not only in calculating its own cell's new value but also by its eight closest neighbors in calculating theirs. Although some global memory access penalties have been alleviated in the latest generation of cards with the addition of an L2 cache, you will find shared memory is still an incredibly important aspect of GPU programming. Shared memory provides a very fast memory space (comparable to register access) that is shared between threads of the same thread block. When multiple threads in a particular thread block access the same global memory item, shared memory may be used to reduce the number of global memory calls. A typical use of shared memory is as follows:

  1. Have each thread read from global memory into shared memory
  2. Synchronize threads to ensure all threads in a given block have finished reading from global memory
  3. Have each thread perform calculations using shared memory values
  4. Have each thread write result from shared memory back to device global memory

For detailed coverage of shared memory please see the CUDA C Programming Guide and CUDA Fortran Programming Guide.
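
As a minimal, self-contained illustration of these four steps (a sketch, not part of the tutorial code), the kernel below averages neighboring elements of a vector using one shared memory tile per block. The module, kernel name, and tile size of 256 are illustrative assumptions; the tile size must match the launch configuration.

module smooth_mod
    contains

    attributes(global) subroutine smooth_kernel(n, a, b)
        integer,value :: n
        real,device,dimension(n) :: a, b
        !One shared memory tile per block (assumes blockDim%x == 256)
        real,shared :: s_a(256)
        integer :: gid, lid

        !1. Read from global memory into shared memory
        gid = (blockIdx%x-1) * blockDim%x + threadIdx%x
        lid = threadIdx%x
        if (gid <= n) s_a(lid) = a(gid)

        !2. Wait until every thread in the block has finished loading
        call syncthreads()

        !3. Compute using shared memory values and
        !4. write the result back to global memory
        !(block-boundary elements are left untouched in this simple sketch)
        if (gid > 1 .AND. gid < n .AND. lid > 1 .AND. lid < blockDim%x) then
            b(gid) = (s_a(lid-1) + s_a(lid) + s_a(lid+1)) / 3.0
        endif
    end subroutine smooth_kernel
end module smooth_mod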

module kernel
    contains

    attributes(global) subroutine ghostCols_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [2,dim+1]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

        if(id < dim+2) then
            grid(id, dim+2) = grid(id, 2) !Copy first real column to right most ghost column
            grid(id, 1) = grid(id, dim+1) !Copy last real column to left most ghost column
        endif

    end subroutine ghostCols_kernel

    attributes(global) subroutine ghostRows_kernel(dim, grid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid
        integer :: id

        !We want id ∈ [1,dim+2]
        !Get Global thread ID
        id = (blockIdx%x-1) * blockDim%x + threadIdx%x

        if(id <= dim+2) then
            grid(dim+2, id) = grid(2, id) !Copy first real row to bottom ghost row
            grid(1, id) = grid(dim+1 ,id) !Copy last real row to top ghost row
        endif
    end subroutine ghostRows_kernel

    attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
        integer,value :: dim
        integer,device,dimension(dim+2,dim+2) :: grid, newGrid
        integer :: idx, idy, id, i, j, numNeighbors, cell

        !Declare the shared memory on a per block level
        integer,shared :: s_grid(16, 16)

        !Get global thread IDs
        idx = (blockIdx%x-1) * (blockDim%x-2) + threadIdx%x
        idy = (blockIdx%y-1) * (blockDim%y-2) + threadIdx%y

        !Get local thread IDs
        i = threadIdx%y
        j = threadIdx%x

        !Copy cells into shared memory
        if (idx <= dim+2 .AND. idy <= dim+2) then
            s_grid(i,j) = grid(idy,idx)
        endif

        !Sync all threads in block
        call syncthreads()

        if (idx < dim+2 .AND. idy < dim+2) then
            if (i /= 1 .AND. i /= blockDim%y .AND. j /= 1 .AND. j /= blockDim%x) then

                numNeighbors = s_grid(i,j+1) + s_grid(i,j-1) &     !right & left
                             + s_grid(i+1,j) + s_grid(i-1,j) &     !upper & lower
                             + s_grid(i+1,j+1) + s_grid(i-1,j-1) & !diagonals
                             + s_grid(i-1,j+1) + s_grid(i+1,j-1)

                cell = s_grid(i,j)

                !Here we have explicitly all of the game rules
                if(cell == 1 .AND. numNeighbors < 2) then
                    newGrid(idy,idx) = 0
                elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                    newGrid(idy,idx) = 1
                elseif(cell == 1 .AND. numNeighbors > 3) then
                    newGrid(idy,idx) = 0
                elseif(cell == 0 .AND. numNeighbors == 3) then
                    newGrid(idy,idx) = 1
                else
                    newGrid(idy,idx) = cell
                endif

            endif
        endif

    end subroutine GOL_kernel
end module kernel

program main
    use cudafor
    use kernel

    implicit none

    integer :: i,j,iter,seed(8),numNeighbors,total
    real :: randm
    !Linear game grid dimension
    integer :: dim = 1024
    !Number of game iterations
    integer :: maxIter = 2**10
    integer :: blockDim = 16
    type(dim3) :: cpyBlockSize, cpyColsGridSize, cpyRowsGridSize, blockSize, gridSize

    !Host game grid array
    integer,dimension(:,:),allocatable :: h_grid

    !Device game grid array
    integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid

    !Allocate square grid of (dim+2)^2 elements on host, 2 added for ghost cells
    allocate(h_grid(dim+2,dim+2))

    !Allocate grid on device
    allocate(d_grid(dim+2,dim+2))
    allocate(d_newGrid(dim+2,dim+2))

    !Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            h_grid(i,j) = nint(randm)
        enddo
    enddo

    !Implicit copy of host vectors to device
    d_grid = h_grid

    !Set the number of threads per block and blocks per grid
    cpyBlockSize    = dim3(blockDim, 1, 1)
    cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
    cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)

    blockSize   = dim3(blockDim-2, blockDim-2, 1)
    gridSize    = dim3(ceiling(real(dim)/real(blockSize%x-2)), &
                    ceiling(real(dim)/real(blockSize%y-2)), 1)

    !Main game loop
    do iter=1,maxIter
        !device pointers are not supported currently, so manually switch grid/newGrid
        if(MOD(iter,2) == 1) then
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
        else
            call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
            call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
        endif
    enddo

    !Implicit copy of device vector back to host
    h_grid = d_grid

    !Sum up alive cells and print results
    total = 0
    do j=2,dim+1
        do i=2,dim+1
            total = total + h_grid(i,j)
        enddo
    enddo
    print *, "Total Alive", total

    !Release memory
    deallocate(h_grid)
    deallocate(d_grid)
    deallocate(d_newGrid)

end program

Changes to GOL-Shared.cuf

!Declare the shared memory on a per block level
integer,shared :: s_grid(16, 16)

The shared attribute is used in the device kernel to allocate shared memory arrays. The specified size is on a per-block basis; with the default 4-byte integer kind, this 16 x 16 array uses 1 KiB of shared memory per block.

!Get global thread IDs
idx = (blockIdx%x-1) * (blockDim%x-2) + threadIdx%x
idy = (blockIdx%y-1) * (blockDim%y-2) + threadIdx%y

Extra care has to be taken when determining our global thread ID for the game kernel. Each thread block contains two ghost rows and two ghost columns, which must be accounted for when calculating the global indexes idx and idy.
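
Concretely, with the launch parameters used in this example the kernel sees blockDim%x = 14, so the index arithmetic for the first two blocks in the x direction works out to:

!blockIdx%x = 1: idx = 0*(14-2) + threadIdx%x ->  1..14 (interior columns  2..13 updated)
!blockIdx%x = 2: idx = 1*(14-2) + threadIdx%x -> 13..26 (interior columns 14..25 updated)
!Adjacent tiles overlap by two columns: the per-block ghost cells.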

!Get local thread IDs
i = threadIdx%y
j = threadIdx%x

We define some convenience variables for the block thread indices.

!Copy cells into shared memory
if (idx <= dim+2 .AND. idy <= dim+2) then
    s_grid(i,j) = grid(idy,idx)
endif

Each thread that is within the bounds of our grid, including ghost cells, will read one value from global memory into shared memory.

!Sync all threads in block
call syncthreads()

Before we use any shared memory we need to ensure that each thread in the block has finished reading its value from global memory into shared memory, so we call syncthreads, a thread-block-level barrier. Once all threads in a given block have reached the barrier, execution continues.

if (idx < dim+2 .AND. idy < dim+2) then
    if (i /= 1 .AND. i /= blockDim%y .AND. j /= 1 .AND. j /= blockDim%x) then

Before calculating our new values we must check that our global thread IDs, idy and idx, do not extend into, or past, the ghost cells. The local thread IDs, i and j, must also be checked so that no calculations are performed in the per-block ghost cells.

numNeighbors = s_grid(i,j+1) + s_grid(i,j-1) &     !right & left
             + s_grid(i+1,j) + s_grid(i-1,j) &     !upper & lower
             + s_grid(i+1,j+1) + s_grid(i-1,j-1) & !diagonals
             + s_grid(i-1,j+1) + s_grid(i+1,j-1)
 
cell = s_grid(i,j)

The standard GOL calculations are performed as before, only now we read all cell values from shared memory instead of global memory.

    blockSize   = dim3(blockDim-2, blockDim-2, 1)
    gridSize    = dim3(ceiling(real(dim)/real(blockSize%x-2)), &
                    ceiling(real(dim)/real(blockSize%y-2)), 1)

A slight modification is needed to set the correct block and grid size: because each thread block now includes its own ghost cells, a block updates fewer cells than it contains threads, so more blocks are needed to cover the game board.
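
With dim = 1024 and blockDim = 16 these expressions work out as follows:

    !blockSize = dim3(14, 14, 1)                      -> 196 threads per block
    !gridSize  = dim3(ceiling(1024.0/12.0), ..., 1)   -> 86 x 86 blocks
    !Each block loads up to a 14x14 tile into shared memory but updates only its
    !12x12 interior, so neighboring tiles overlap by the per-block ghost cells.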