For this tutorial we will cover two different CUDA Fortran versions of our Game of Life program. The first will strictly use global memory and be a straightforward GPU port of our program; the second will introduce the use of shared memory. For a description of the Game of Life and the original CPU code please see the tutorial here. The full source can be viewed or downloaded at the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.
GOL.cuf
module kernels
contains

  attributes(global) subroutine ghostCols_kernel(dim, grid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid
    integer :: id

    !We want id ∈ [2,dim+1]
    !Get Global thread ID
    id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

    if(id < dim+2) then
      grid(id, dim+2) = grid(id, 2)  !Copy first real column to right most ghost column
      grid(id, 1) = grid(id, dim+1)  !Copy last real column to left most ghost column
    endif

  end subroutine ghostCols_kernel

  attributes(global) subroutine ghostRows_kernel(dim, grid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid
    integer :: id

    !We want id ∈ [1,dim+2]
    !Get Global thread ID
    id = (blockIdx%x-1) * blockDim%x + threadIdx%x

    if(id <= dim+2) then
      grid(dim+2, id) = grid(2, id)  !Copy first real row to bottom ghost row
      grid(1, id) = grid(dim+1, id)  !Copy last real row to top ghost row
    endif

  end subroutine ghostRows_kernel

  attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid, newGrid
    integer :: idx, idy, id, numNeighbors, cell

    !Get global thread IDs
    idx = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1
    idy = (blockIdx%y-1) * blockDim%y + threadIdx%y + 1

    if(idx < dim+2 .AND. idy < dim+2) then

      numNeighbors = grid(idy,idx+1) + grid(idy,idx-1) &      !right & left
                   + grid(idy+1,idx) + grid(idy-1,idx) &      !upper & lower
                   + grid(idy+1,idx+1) + grid(idy-1,idx-1) &  !diagonals
                   + grid(idy-1,idx+1) + grid(idy+1,idx-1)

      cell = grid(idy,idx)

      !Here we have explicitly all of the game rules
      if(cell == 1 .AND. numNeighbors < 2) then
        newGrid(idy,idx) = 0
      elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
        newGrid(idy,idx) = 1
      elseif(cell == 1 .AND. numNeighbors > 3) then
        newGrid(idy,idx) = 0
      elseif(cell == 0 .AND. numNeighbors == 3) then
        newGrid(idy,idx) = 1
      else
        newGrid(idy,idx) = cell
      endif

    endif

  end subroutine GOL_kernel

end module kernels

program main
  use cudafor
  use kernels
  implicit none

  integer :: i,j,iter,seed(8),numNeighbors,total
  real :: randm
  !Linear game grid dimension
  integer :: dim = 1024
  !Number of game iterations
  integer :: maxIter = 2**10
  integer :: blockDim = 16
  type(dim3) :: cpyBlockSize, cpyColsGridSize, cpyRowsGridSize, blockSize, gridSize

  !Host game grid array
  integer,dimension(:,:),allocatable :: h_grid

  !Device game grid arrays
  integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid

  !Allocate square grid of (dim+2)^2 elements on host, 2 added for ghost cells
  allocate(h_grid(dim+2,dim+2))

  !Allocate grid on device
  allocate(d_grid(dim+2,dim+2))
  allocate(d_newGrid(dim+2,dim+2))

  !Assign initial population randomly
  seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
  call random_seed(PUT=seed)
  do j=1,dim
    do i=1,dim
      call random_number(randm)
      h_grid(i,j) = nint(randm)
    enddo
  enddo

  !Implicit copy of host vectors to device
  d_grid = h_grid

  !Set the number of threads per block and blocks per grid
  cpyBlockSize    = dim3(blockDim, 1, 1)
  cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
  cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)

  blockSize = dim3(blockDim, blockDim, 1)
  gridSize  = dim3(ceiling(real(dim)/real(blockSize%x)), &
                   ceiling(real(dim)/real(blockSize%y)), 1)

  !Main game loop
  do iter=1,maxIter
    !device pointers are not supported currently so manually switch grid/newGrid
    if(MOD(iter,2) == 1) then
      call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
      call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
      call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
    else
      call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
      call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
      call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
    endif
  enddo

  !Implicit copy of device vector back to host
  h_grid = d_grid

  !Sum up alive cells and print results
  total = 0
  do j=2,dim+1
    do i=2,dim+1
      total = total + h_grid(i,j)
    enddo
  enddo
  print *, "Total Alive", total

  !Release memory
  deallocate(h_grid)
  deallocate(d_grid)
  deallocate(d_newGrid)

end program
Changes to GOL.cuf
attributes(global) subroutine ghostCols_kernel(dim, grid)
  integer,value :: dim
  integer,device,dimension(dim+2,dim+2) :: grid
  integer :: id

  !We want id ∈ [2,dim+1]
  !Get Global thread ID
  id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

  if(id < dim+2) then
    grid(id, dim+2) = grid(id, 2)  !Copy first real column to right most ghost column
    grid(id, 1) = grid(id, dim+1)  !Copy last real column to left most ghost column
  endif

end subroutine ghostCols_kernel

attributes(global) subroutine ghostRows_kernel(dim, grid)
  integer,value :: dim
  integer,device,dimension(dim+2,dim+2) :: grid
  integer :: id

  !We want id ∈ [1,dim+2]
  !Get Global thread ID
  id = (blockIdx%x-1) * blockDim%x + threadIdx%x

  if(id <= dim+2) then
    grid(dim+2, id) = grid(2, id)  !Copy first real row to bottom ghost row
    grid(1, id) = grid(dim+1, id)  !Copy last real row to top ghost row
  endif

end subroutine ghostRows_kernel
The ghostRows_kernel and ghostCols_kernel device subroutines are used to fill in our ghost rows and columns in device global memory. Each thread is responsible for copying two integers.
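With dim = 1024, for example, ghostCols_kernel sets grid(id, 1026) = grid(id, 2) and grid(id, 1) = grid(id, 1025). Together with ghostRows_kernel this stitches the board into a torus: cells on one edge see the cells on the opposite edge as neighbors, with no special-case logic needed in the game kernel itself.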
attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
  integer,value :: dim
  integer,device,dimension(dim+2,dim+2) :: grid, newGrid
  integer :: idx, idy, id, numNeighbors, cell

  !Get global thread IDs
  idx = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1
  idy = (blockIdx%y-1) * blockDim%y + threadIdx%y + 1

  if(idx < dim+2 .AND. idy < dim+2) then

    numNeighbors = grid(idy,idx+1) + grid(idy,idx-1) &      !right & left
                 + grid(idy+1,idx) + grid(idy-1,idx) &      !upper & lower
                 + grid(idy+1,idx+1) + grid(idy-1,idx-1) &  !diagonals
                 + grid(idy-1,idx+1) + grid(idy+1,idx-1)

    cell = grid(idy,idx)

    !Here we have explicitly all of the game rules
    if(cell == 1 .AND. numNeighbors < 2) then
      newGrid(idy,idx) = 0
    elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
      newGrid(idy,idx) = 1
    elseif(cell == 1 .AND. numNeighbors > 3) then
      newGrid(idy,idx) = 0
    elseif(cell == 0 .AND. numNeighbors == 3) then
      newGrid(idy,idx) = 1
    else
      newGrid(idy,idx) = cell
    endif

  endif

end subroutine GOL_kernel
The GOL kernel will perform the majority of our work. We use a two-dimensional block and grid size to better fit our problem's geometry. Once a thread calculates its global ID and tests that it is not out of bounds, it reads its cell's value and those of the cell's eight neighbors from global memory, calculates the new value, and writes the result back to global memory.
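As a quick sanity check of the indexing: thread threadIdx%x = 1 of block blockIdx%x = 1 gets idx = 2, the first real cell, while the largest ID launched, idx = 63*16 + 16 + 1 = 1025, lands on the last real cell. The bounds test idx < dim+2 only discards excess threads when dim is not a multiple of the block size.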
!Host game grid array
integer,dimension(:,:),allocatable :: h_grid

!Device game grid arrays
integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid
Currently CUDA Fortran does not support device pointers and so we change our array attribute from pointer to allocatable.
!Allocate grid on device
allocate(d_grid(dim+2,dim+2))
allocate(d_newGrid(dim+2,dim+2))
Given an array with the device attribute, memory will be allocated in the GPU's global memory space.
!Implicit copy of host vectors to device
d_grid = h_grid
By setting the device array equal to the host array, a host-to-device DMA copy is implicitly initiated.
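If you prefer an explicit call over assignment, the cudafor module also provides cudaMemcpy; a minimal sketch, assuming the standard CUDA Fortran interface in which the count is given in array elements and the transfer direction is inferred from the array attributes:

  integer :: istat
  !Explicit equivalent of d_grid = h_grid; count is in elements
  istat = cudaMemcpy(d_grid, h_grid, (dim+2)*(dim+2))

The assignment form used throughout this tutorial is the more idiomatic choice.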
!Set the number of threads per block and blocks per grid
cpyBlockSize    = dim3(blockDim, 1, 1)
cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)

blockSize = dim3(blockDim, blockDim, 1)
gridSize  = dim3(ceiling(real(dim)/real(blockSize%x)), &
                 ceiling(real(dim)/real(blockSize%y)), 1)
With our block size specified we must calculate the smallest number of blocks necessary to completely cover our game board; these grid sizes are also of the CUDA Fortran data type dim3.
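With the values used here, dim = 1024 and blockDim = 16, this works out to ceiling(1024/16) = 64 blocks per dimension for cpyColsGridSize and gridSize, and ceiling(1026/16) = 65 blocks for cpyRowsGridSize, since the ghost rows must be filled across all dim+2 columns, corners included.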
!Main game loop
do iter=1,maxIter
  !device pointers are not supported currently so manually switch grid/newGrid
  if(MOD(iter,2) == 1) then
    call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
    call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
    call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
  else
    call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
    call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
    call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
  endif
enddo
Because CUDA Fortran does not support device pointers, we cannot simply swap our device grid pointers at the end of each game iteration; instead, we swap the arguments we pass to our kernels. This achieves the same effect with a minimal increase in code size.
!Implicit copy of device vector back to host
h_grid = d_grid
After our kernel has completed we can implicitly copy the grid array from the device back to the host.
GOL-Shared.cuf
In the first example each thread read nine values from global memory and wrote back one value. The inefficiency of this approach is that each value read from global memory is used not only in calculating that cell's new value but also by its eight closest neighbors in calculating theirs. Although some global memory access penalties have been alleviated in the latest generation of cards with the addition of an L2 cache, you will find shared memory is still an incredibly important aspect of GPU programming. Shared memory provides a very fast memory space (comparable to register access) that is shared between threads of the same thread block. When multiple threads in a particular thread block access the same global memory item, shared memory may be used to reduce the number of global memory calls. A typical use for shared memory is as follows (a minimal sketch appears after the list):
- Have each thread read from global memory into shared memory
- Synchronize threads to ensure all threads in a given block have finished reading from global memory
- Have each thread perform calculations using shared memory values
- Have each thread write its result from shared memory back to device global memory
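As a concrete illustration of the four steps above, here is a minimal sketch applied to a 1D three-point sum. The kernel, its names (sum3_kernel, s_data), and the assumed 128-thread block size are illustrative only, not part of the tutorial code:

module sketch
contains

  attributes(global) subroutine sum3_kernel(n, a, b)
    integer,value :: n
    integer,device,dimension(n) :: a, b
    !Shared tile, one element per thread; assumes 128 threads per block
    integer,shared :: s_data(128)
    integer :: g, l

    g = (blockIdx%x-1) * blockDim%x + threadIdx%x  !global index
    l = threadIdx%x                                !local index into the shared tile

    !1) Each thread reads one value from global into shared memory
    if (g <= n) s_data(l) = a(g)

    !2) Block-level barrier: all reads must finish before any use
    call syncthreads()

    !3) Compute from shared memory values; interior threads only, so no
    !   thread reads past the tile its block loaded
    if (g <= n .AND. l > 1 .AND. l < blockDim%x) then
      !4) Write the result back to device global memory
      b(g) = s_data(l-1) + s_data(l) + s_data(l+1)
    endif

  end subroutine sum3_kernel

end module sketch

Elements at the tile edges are left unhandled in this sketch; GOL-Shared.cuf deals with them by overlapping neighboring tiles, as described below.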
For detailed coverage of shared memory please see the CUDA C Programming Guide and CUDA Fortran Programming Guide.
module kernel
contains

  attributes(global) subroutine ghostCols_kernel(dim, grid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid
    integer :: id

    !We want id ∈ [2,dim+1]
    !Get Global thread ID
    id = (blockIdx%x-1) * blockDim%x + threadIdx%x + 1

    if(id < dim+2) then
      grid(id, dim+2) = grid(id, 2)  !Copy first real column to right most ghost column
      grid(id, 1) = grid(id, dim+1)  !Copy last real column to left most ghost column
    endif

  end subroutine ghostCols_kernel

  attributes(global) subroutine ghostRows_kernel(dim, grid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid
    integer :: id

    !We want id ∈ [1,dim+2]
    !Get Global thread ID
    id = (blockIdx%x-1) * blockDim%x + threadIdx%x

    if(id <= dim+2) then
      grid(dim+2, id) = grid(2, id)  !Copy first real row to bottom ghost row
      grid(1, id) = grid(dim+1, id)  !Copy last real row to top ghost row
    endif

  end subroutine ghostRows_kernel

  attributes(global) subroutine GOL_kernel(dim, grid, newGrid)
    integer,value :: dim
    integer,device,dimension(dim+2,dim+2) :: grid, newGrid
    integer :: idx, idy, id, i, j, numNeighbors, cell

    !Declare the shared memory on a per block level
    integer,shared :: s_grid(16, 16)

    !Get global thread IDs
    idx = (blockIdx%x-1) * (blockDim%x-2) + threadIdx%x
    idy = (blockIdx%y-1) * (blockDim%y-2) + threadIdx%y

    !Get local thread IDs
    i = threadIdx%y
    j = threadIdx%x

    !Copy cells into shared memory
    if (idx <= dim+2 .AND. idy <= dim+2) then
      s_grid(i,j) = grid(idy,idx)
    endif

    !Sync all threads in block
    call syncthreads()

    if (idx < dim+2 .AND. idy < dim+2) then
      if (i /= 1 .AND. i /= blockDim%y .AND. j /= 1 .AND. j /= blockDim%x) then

        numNeighbors = s_grid(i,j+1) + s_grid(i,j-1) &      !right & left
                     + s_grid(i+1,j) + s_grid(i-1,j) &      !upper & lower
                     + s_grid(i+1,j+1) + s_grid(i-1,j-1) &  !diagonals
                     + s_grid(i-1,j+1) + s_grid(i+1,j-1)

        cell = s_grid(i,j)

        !Here we have explicitly all of the game rules
        if(cell == 1 .AND. numNeighbors < 2) then
          newGrid(idy,idx) = 0
        elseif(cell == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
          newGrid(idy,idx) = 1
        elseif(cell == 1 .AND. numNeighbors > 3) then
          newGrid(idy,idx) = 0
        elseif(cell == 0 .AND. numNeighbors == 3) then
          newGrid(idy,idx) = 1
        else
          newGrid(idy,idx) = cell
        endif

      endif
    endif

  end subroutine GOL_kernel

end module kernel

program main
  use cudafor
  use kernel
  implicit none

  integer :: i,j,iter,seed(8),numNeighbors,total
  real :: randm
  !Linear game grid dimension
  integer :: dim = 1024
  !Number of game iterations
  integer :: maxIter = 2**10
  integer :: blockDim = 16
  type(dim3) :: cpyBlockSize, cpyColsGridSize, cpyRowsGridSize, blockSize, gridSize

  !Host game grid array
  integer,dimension(:,:),allocatable :: h_grid

  !Device game grid arrays
  integer,device,dimension(:,:),allocatable :: d_grid, d_newGrid

  !Allocate square grid of (dim+2)^2 elements on host, 2 added for ghost cells
  allocate(h_grid(dim+2,dim+2))

  !Allocate grid on device
  allocate(d_grid(dim+2,dim+2))
  allocate(d_newGrid(dim+2,dim+2))

  !Assign initial population randomly
  seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
  call random_seed(PUT=seed)
  do j=1,dim
    do i=1,dim
      call random_number(randm)
      h_grid(i,j) = nint(randm)
    enddo
  enddo

  !Implicit copy of host vectors to device
  d_grid = h_grid

  !Set the number of threads per block and blocks per grid
  cpyBlockSize    = dim3(blockDim, 1, 1)
  cpyColsGridSize = dim3(ceiling(real(dim)/real(cpyBlockSize%x)), 1, 1)
  cpyRowsGridSize = dim3(ceiling(real(dim+2)/real(cpyBlockSize%x)), 1, 1)

  blockSize = dim3(blockDim-2, blockDim-2, 1)
  gridSize  = dim3(ceiling(real(dim)/real(blockSize%x-2)), &
                   ceiling(real(dim)/real(blockSize%y-2)), 1)

  !Main game loop
  do iter=1,maxIter
    !device pointers are not supported currently, so manually switch grid/newGrid
    if(MOD(iter,2) == 1) then
      call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_grid)
      call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_grid)
      call GOL_kernel<<<gridSize,blockSize>>>(dim, d_grid, d_newGrid)
    else
      call ghostCols_kernel<<<cpyColsGridSize, cpyBlockSize>>>(dim, d_newGrid)
      call ghostRows_kernel<<<cpyRowsGridSize, cpyBlockSize>>>(dim, d_newGrid)
      call GOL_kernel<<<gridSize,blockSize>>>(dim, d_newGrid, d_grid)
    endif
  enddo

  !Implicit copy of device vector back to host
  h_grid = d_grid

  !Sum up alive cells and print results
  total = 0
  do j=2,dim+1
    do i=2,dim+1
      total = total + h_grid(i,j)
    enddo
  enddo
  print *, "Total Alive", total

  !Release memory
  deallocate(h_grid)
  deallocate(d_grid)
  deallocate(d_newGrid)

end program
Changes to GOL-Shared.cuf
!Declare the shared memory on a per block level
integer,shared :: s_grid(16, 16)
The shared attribute is used in the device kernel to allocate shared memory arrays. The specified size is on a per-block basis.
!Get global thread IDs
idx = (blockIdx%x-1) * (blockDim%x-2) + threadIdx%x
idy = (blockIdx%y-1) * (blockDim%y-2) + threadIdx%y
Extra care must be taken when determining our global thread IDs for the game kernel. Each thread block contains two ghost rows and two ghost columns, which must be accounted for when calculating the global indices idx and idy.
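To make the overlap concrete: with the 14x14 thread blocks launched below, blockDim%x-2 = 12, so block (1,1) loads cells with idx = 1 through 14 into its tile while block (2,1) loads idx = 13 through 26. Neighboring tiles deliberately overlap by two columns (and, in y, two rows); those overlapping cells serve as each block's private ghost cells.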
!Get local thread IDs
i = threadIdx%y
j = threadIdx%x
We define some convenience variables for the block thread indices.
!Copy cells into shared memory
if (idx <= dim+2 .AND. idy <= dim+2) then
  s_grid(i,j) = grid(idy,idx)
endif
Each thread that is within the bounds of our grid, including ghost cells, will read one value from global memory into shared memory.
!Sync all threads in block
call syncthreads()
Before we use any shared memory we must ensure that each thread in the block has finished reading its value from global memory into shared memory, by calling syncthreads(), a thread-block-level barrier. Once all threads in a given block have reached the barrier, execution resumes.
if (idx < dim+2 .AND. idy < dim+2) then
  if (i /= 1 .AND. i /= blockDim%y .AND. j /= 1 .AND. j /= blockDim%x) then
Before calculating our new values we must check that the global thread IDs, idy and idx, do not extend into, or past, the ghost cells. The local thread IDs, i and j, must also be checked so that no calculations are performed in the per-block ghost cells.
numNeighbors = s_grid(i,j+1) + s_grid(i,j-1) &      !right & left
             + s_grid(i+1,j) + s_grid(i-1,j) &      !upper & lower
             + s_grid(i+1,j+1) + s_grid(i-1,j-1) &  !diagonals
             + s_grid(i-1,j+1) + s_grid(i+1,j-1)

cell = s_grid(i,j)
The standard GOL calculations are performed as before, except that all cell values are now read from shared memory instead of global memory.
blockSize = dim3(blockDim-2, blockDim-2, 1)
gridSize  = dim3(ceiling(real(dim)/real(blockSize%x-2)), &
                 ceiling(real(dim)/real(blockSize%y-2)), 1)
A slight modification is needed to set the correct block and grid size: because each thread block now carries its own ghost cells, the number of new cells a block computes is smaller than its thread dimensions, and the grid must be sized by that effective tile rather than by the raw block size.
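Checking the arithmetic against the code above, with blockDim = 16: blockSize is 14x14 threads, each block computes only its 12x12 interior tile, and gridSize becomes ceiling(1024/12) = 86 blocks in each dimension. That is more blocks than the 64 per dimension the global-memory version needed, in exchange for replacing most global memory reads with shared memory reads.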