This sample shows the Game of Life CPU code converted to an OpenACC accelerator directive version. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.

GOL.c

#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

#define dim 1024 // Linear game grid dimension excluding ghost cells
#define idx(i,j) ((i)*(dim+2)+(j))

// Add up all neighbors
inline int getNeighbors(int *grid, int i, int j)
{
    int num = grid[idx(i+1,j)] + grid[idx(i-1,j)]     //upper lower
            + grid[idx(i,j+1)] + grid[idx(i,j-1)]     //right left
            + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)] //diagonals
            + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

    return num;
}

int main(int argc, char* argv[])
{
    int i,j,iter;
    // Number of game iterations
    int maxIter = 1<<10;

    // Total number of alive cells
    int total = 0;

    // Total size of grid in bytes
    size_t bytes = sizeof(int)*(dim+2)*(dim+2);

    // Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    int *grid = (int*) malloc(bytes);

    // Allocate newGrid
    int *restrict newGrid = (int*) malloc(bytes);

    // Assign initial population randomly
    srand(SRAND_VALUE);
    for(i = 1; i <= dim; i++) {
        for(j = 1; j <= dim; j++) {
            grid[idx(i,j)] = rand() % 2;
        }
    }

  int fullSize = (dim+2)*(dim+2);
  #pragma acc data copyin(grid[0:fullSize]) create(newGrid[0:fullSize])
  {
    // Main game loop
    for (iter = 0; iter<maxIter; iter++) {
    #pragma acc kernels
    {
        // Left-Right columns
        #pragma acc loop independent
        for (i = 1; i <= dim; i++) {
            grid[idx(i,0)] = grid[idx(i,dim)];   //Copy last real column to left ghost column
            grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy first real column to right ghost column
        }
        // Top-Bottom rows
        #pragma acc loop independent
        for (j = 0; j <= dim+1; j++) {
            grid[idx(0,j)] = grid[idx(dim,j)];   //Copy last real row to top ghost row
            grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy first real row to bottom ghost row
        }

        // Now we loop over all cells and determine their fate
        #pragma acc loop independent
        for (i = 1; i <= dim; i++) {
            for (j = 1; j <= dim; j++) {
               // Get the number of neighbors for a given grid point
                int numNeighbors = getNeighbors(grid,i,j);

                // Here we have explicitly all of the game rules
                if (grid[idx(i,j)] == 1 && numNeighbors < 2)
                    newGrid[idx(i,j)] = 0;
                else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
                    newGrid[idx(i,j)] = 1;
                else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
                    newGrid[idx(i,j)] = 0;
                else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
                    newGrid[idx(i,j)] = 1;
                else
                    newGrid[idx(i,j)] = grid[idx(i,j)];
            }
        }

       // Can't switch pointers so we manually have to copy the array over
       #pragma acc loop independent
       for(i = 1; i <= dim; i++) {
           for(j = 1; j <= dim; j++) {
               grid[idx(i,j)] = newGrid[idx(i,j)];
           }
       }
    } // End ACC kernels region
    } // End main game loop

    // Sum up alive cells
    #pragma acc parallel
    {
        #pragma acc loop reduction(+:total)
        for (i = 1; i <= dim; i++) {
            #pragma acc loop reduction(+:total)
            for (j = 1; j <= dim; j++) {
                total += grid[idx(i,j)];
            }
        }
    }
  } // End ACC Data region

    // Print number of live cells
    printf("Total Alive: %d\n", total);

    // Release memory
    free(grid);
    free(newGrid);

    return 0;
}

Changes to GOL.c

#define dim 1024 // Linear game grid dimension excluding ghost cells
#define idx(i,j) ((i)*(dim+2)+(j))

At the time of this writing, multidimensional C arrays are not fully supported in all OpenACC implementations, so a convenience macro is defined and used wherever grid or newGrid is accessed: what was grid[i][j] is now written as grid[idx(i,j)].
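
As a quick standalone illustration of the mapping (a sketch, not part of GOL.c), the macro lays the conceptual 2D grid out in row-major order, so an element's flat offset is its row index times the padded row width plus its column index:

#include <stdio.h>

#define dim 4
#define idx(i,j) ((i)*(dim+2)+(j))

int main(void)
{
    /* One linear allocation holds the whole (dim+2) x (dim+2) grid. */
    int flat[(dim+2)*(dim+2)] = {0};

    flat[idx(2,3)] = 7;                /* "grid[2][3] = 7" in 2D terms */
    printf("offset=%d value=%d\n",     /* offset 2*(dim+2)+3 = 15      */
           idx(2,3), flat[idx(2,3)]);
    return 0;
}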

inline int getNeighbors(int *grid, int i, int j)

OpenACC 1.0 does not support function calls within acc regions, so all function calls must be inlined. The inline qualifier can be used as a hint to the compiler that we wish to have the function inlined.
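
As an aside (not used in this sample, and only available in OpenACC 2.0 and later), the routine directive offers an alternative to inlining by asking the compiler to build a device-callable version of the function. A hedged sketch:

/* Sketch only: requires OpenACC 2.0 or later. This sample targets
 * OpenACC 1.0 and therefore relies on inlining instead. */
#pragma acc routine seq
inline int getNeighbors(int *grid, int i, int j)
{
    return grid[idx(i+1,j)]   + grid[idx(i-1,j)]
         + grid[idx(i,j+1)]   + grid[idx(i,j-1)]
         + grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]
         + grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];
}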

// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc(bytes);

// Allocate newGrid
int *restrict newGrid = (int*) malloc(bytes);

The grid and newGrid arrays are now linear. As discussed in the vector addition example, we use the restrict qualifier to help the compiler resolve loop dependencies.
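
For context, here is a minimal sketch (a hypothetical helper, not part of GOL.c) of the aliasing problem that restrict addresses: without the qualifier the compiler must assume dst and src might overlap and may refuse to reorder or parallelize the loop.

/* restrict is the programmer's promise that dst and src never alias,
 * which frees the compiler to reorder or parallelize the copy. */
void copy_row(int *restrict dst, const int *restrict src, int n)
{
    for (int k = 0; k < n; k++)
        dst[k] = src[k];
}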

#pragma acc data copyin(grid[0:fullSize]) create(newGrid[0:fullSize])
{
}

Memory transfers between host and device can be very costly, so optimizing them in any way possible is highly recommended. Here an acc data region encloses the main game loop and the final reduction. grid will be copied to the accelerator at the start of the region; newGrid will only be allocated on the device, so no copies between host and device are needed for it.
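
For comparison, a hedged sketch of a variant: if the final grid itself (and not just the reduction result) were needed back on the host, the copyin clause could be changed to copy, which transfers the array into the region at entry and back out at exit.

/* Variant sketch: copy grid in at region entry and back out at region exit.
 * The sample keeps copyin/create because only the scalar total is needed
 * on the host afterwards. */
#pragma acc data copy(grid[0:fullSize]) create(newGrid[0:fullSize])
{
    /* main game loop and reduction, unchanged */
}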

#pragma acc kernels
{
}

An acc kernels region will attempt to parallelize any enclosed loops across the accelerator. The kernels directive gives the compiler complete control, if not guided by the programmer, in deciding how to map the loops onto the target hardware. It does not guarantee that the loops will be parallelized across the device: if the compiler cannot determine that the loop iterations are independent, a loop may be run sequentially on the accelerator. It is imperative to check the compiler feedback to ensure that the loops are parallelized.

// Left-Right columns
#pragma acc loop independent
// Top-Bottom rows
#pragma acc loop independent
// Now we loop over all cells and determine their fate
#pragma acc loop independent
// Can't switch pointers so we manually have to copy the array over
#pragma acc loop independent

The compiler is not always able to determine that loop iterations can be done independently; when this happens the loop will be scheduled sequentially on the accelerator. The loop independent clause tells the compiler that it is safe to parallelize the immediately following loop.
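
A hedged example of the kind of loop where the assertion matters (a hypothetical scatter routine, unrelated to GOL.c): with indirect indexing the compiler cannot prove that two iterations never write the same element, so without the clause it would likely schedule the loop sequentially.

/* Hypothetical scatter routine. The programmer asserts that perm[] contains
 * no repeated indices, so every iteration writes a distinct element of out[]
 * and the loop is safe to parallelize. */
void scatter(int *restrict out, const int *restrict in,
             const int *restrict perm, int n)
{
    #pragma acc kernels copyin(in[0:n], perm[0:n]) copyout(out[0:n])
    {
        #pragma acc loop independent
        for (int k = 0; k < n; k++)
            out[perm[k]] = in[k];
    }
}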

// Can't switch pointers so we manually have to copy the array over
#pragma acc loop independent
for(i = 1; i <= dim; i++) {
   for(j = 1; j <= dim; j++) {
       grid[idx(i,j)] = newGrid[idx(i,j)];
   }
}

Device pointer manipulation is not currently supported by OpenACC, so the pointer swap must be removed. The easiest way to fix this is to replace the pointer swap with a copy operation, as we have done.
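
For contrast, this is the pointer swap the CPU version would perform at the end of each iteration. Inside an OpenACC 1.0 data region the host pointers are bound to fixed device allocations, so swapping them would break the mapping, hence the explicit copy loop above.

/* CPU-only idiom, shown for contrast; not valid inside the acc data region. */
int *tmp = grid;
grid     = newGrid;
newGrid  = tmp;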

// Sum up alive cells
#pragma acc parallel
{
    #pragma acc loop reduction(+:total)
    for (i = 1; i <= dim; i++) {
        #pragma acc loop reduction(+:total)
        for (j = 1; j <= dim; j++) {
            total += grid[idx(i,j)];
        }
    }
}

At the start of this acc parallel region a number of compiler-determined gangs will be launched on the GPU. The acc loop directive indicates that the immediately following loop will be parallelized across the accelerator. The reduction clause on both loops indicates that a reduction is required on the scalar variable total.
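
An equivalent formulation (a sketch, not what this sample uses) merges the two loops into a single combined directive with collapse(2), producing one flat iteration space with the same sum reduction:

/* Sketch: combined parallel loop; collapse(2) fuses the nested loops. */
#pragma acc parallel loop collapse(2) reduction(+:total)
for (i = 1; i <= dim; i++) {
    for (j = 1; j <= dim; j++) {
        total += grid[idx(i,j)];
    }
}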

Compiling GOL.c

PGI

$ module load cudatoolkit
$ cc -acc gol.c -Minline -Minfo -ta=nvidia:kepler

CRAY
Cray will automatically attempt to inline the getNeighbors function

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ cc -hpragma=acc,msgs gol.c

Compiling GOL.c with CapsMC

If using the GNU programming environment with capsmc, the compiler flag -std=c99 must be added to cc so that the restrict qualifier is handled properly.
If using the Intel programming environment with capsmc, the compiler flag -restrict must be added to cc for the same reason.
CapsMC can be used in conjunction with any of the programming environments available on Titan

CapsMC will automatically attempt to inline the getNeighbors function

$ module load capsmc
$ module load cudatoolkit
$ cc gol.c

GOL.f90

program main
    implicit none
    integer :: i,j,iter,seed(8),numNeighbors,total
    real :: randm
    ! Linear game grid dimension
    integer :: dim = 1024
    ! Number of game iterations
    integer :: maxIter = 2**10

    ! Game grid arrays
    integer,dimension(:,:),allocatable :: grid, newGrid
 
    ! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
    allocate(grid(dim+2,dim+2))
    allocate(newGrid(dim+2,dim+2))

    ! Assign initial population randomly
    seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
    call random_seed(PUT=seed)
    do j=1,dim
        do i=1,dim
            call random_number(randm)
            grid(i,j) = nint(randm)
        enddo
    enddo

    !$acc data copyin(grid), create(newGrid)

    ! Main game loop
    do iter=1,maxIter

    !$acc kernels

        ! Top-Bottom ghost rows
        !$acc loop independent
        do j=2,dim+1
            grid(1,j) = grid(dim+1,j) !Copy last real row to top ghost row
            grid(dim+2,j) = grid(2,j) !Copy first real row to bottom ghost row
        enddo

        ! Left-Right ghost columns
        !$acc loop independent
        do i=1,dim+2
            grid(i,1) = grid(i,dim+1) !Copy last real column to left ghost column
            grid(i,dim+2) = grid(i,2) !Copy first real column to right ghost column
        enddo

        ! Now we loop over all cells and determine their fate
        do j=2,dim+1
            do i=2,dim+1
                ! Get the number of neighbors for a given grid point
                numNeighbors = grid(i,j+1) + grid(i,j-1)&     !left and right
                             + grid(i+1,j) + grid(i-1,j)&     !upper and lower
                             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
                             + grid(i-1,j+1) + grid(i+1,j-1)

                ! Here we have explicitly all of the game rules
                if(grid(i,j) == 1 .AND. numNeighbors < 2) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
                    newGrid(i,j) = 1
                elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
                    newGrid(i,j) = 0
                elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
                    newGrid(i,j) = 1
                else
                    newGrid(i,j) = grid(i,j)
                endif
            enddo
        enddo

        ! Can't switch pointers so we manually have to copy the array over
        do j=2,dim+1
            do i=2,dim+1
                grid(i,j) = newGrid(i,j)
            enddo
        enddo

    !$acc end kernels
    enddo !end main game loop

    total = 0
    ! Sum up alive cells and print results
    !$acc parallel
    !$acc loop reduction(+:total)
    do j=2,dim+1
        !$acc loop reduction(+:total)
        do i=2,dim+1
            total = total + grid(i,j)
        enddo
    enddo
    !$acc end parallel

    !$acc end data

    print *, "Total Alive", total

end program

Changes to GOL.f90

!$acc data copyin(grid), create(newGrid)
!$acc end data

Memory transfers between host and device can be very costly, so optimizing them in any way possible is highly recommended. Here an acc data region encloses the main game loop and the final reduction. grid will be copied to the accelerator at the start of the region; newGrid will only be allocated on the device, so no copies between host and device are needed for it. Because grid and newGrid are allocatable arrays whose shape is known to the compiler, no array bounds need to be given in the data clauses.

!$acc kernels
!$acc end kernels

An acc kernels region will attempt to parallelize any enclosed loops across the accelerator. The kernels directive gives the compiler complete control, if not guided by the programmer, in deciding how to map the loops onto the target hardware. It does not guarantee that the loops will be parallelized across the device: if the compiler cannot determine that the loop iterations are independent, a loop may be run sequentially on the accelerator. It is imperative to check the compiler feedback to ensure that the loops are parallelized.

!$acc loop independent
!$acc loop independent

The compiler is not always able to determine that loop iterations can be done independently; when this happens the loop will be scheduled sequentially on the accelerator. The loop independent clause tells the compiler that it is safe to parallelize the immediately following loop.

numNeighbors = grid(i,j+1) + grid(i,j-1)&     !left and right
             + grid(i+1,j) + grid(i-1,j)&     !upper and lower
             + grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
             + grid(i-1,j+1) + grid(i+1,j-1)

OpenACC 1.0 does not support function calls within acc regions, so all function calls must be inlined. For this example, using the current set of compilers, it was best to inline the neighbor calculation by hand.

! Can't switch pointers so we manually have to copy the array over
do j=2,dim+1
    do i=2,dim+1
        grid(i,j) = newGrid(i,j)
    enddo
enddo

Device pointer manipulation is not currently supported by OpenACC, so the pointer swap must be removed. The easiest way to fix this is to replace the pointer swap with a copy operation, as we have done.

! Sum up alive cells and print results
!$acc parallel
!$acc loop reduction(+:total)
do j=2,dim+1
    !$acc loop reduction(+:total)
    do i=2,dim+1
        total = total + grid(i,j)
    enddo 
enddo
!$acc end parallel

At the start of this acc parallel region a number of compiler-determined gangs will be launched on the GPU. The acc loop directive indicates that the immediately following loop will be parallelized across the accelerator. The reduction clause on both loops indicates that a reduction is required on the scalar variable total.

Compiling GOL.f90

OpenACC 1.0 does not support function calls within acc regions; all function calls must be inlined, either by hand (as was done for the neighbor calculation here) or with the appropriate compiler flag.
PGI

$ module load cudatoolkit
$ ftn -acc gol.f90 -Minline -Minfo -ta=nvidia:kepler

CRAY

Cray will automatically try to inline the getNeighbors function

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ ftn -hacc,msgs gol.f90

Compiling GOL.f90 with CapsMC

CapsMC can be used in conjunction with any of the programming environments available on Titan

CapsMC will automatically try to inline the getNeighbors function

$ module load capsmc
$ module load cudatoolkit
$ ftn gol.f90