This sample shows the Game of Life CPU code converted to an HMPP accelerator directive version. Both codelet and region C and Fortran methods are presented. A second version is presented that fixes a performance issue incurred in the initial version. Please direct any questions or comments to help@nccs.gov

This tutorial covers CAPS HMPP Accelerator directives, If you are interested in CAPS OpenACC support please see the OpenACC Vector Addition Tutorial.

## GOL-codelet.c

```#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

#pragma hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2], int maxIter)
{
int i, j, iter;

// Main game loop
for (iter = 0; iter<maxIter; iter++) {

// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
}

// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
+ grid[i][j+1] + grid[i][j-1]      //right left
+ grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
+ grid[i-1][j+1] + grid[i+1][j-1];

// Here we have explicitly all of the game rules
if (grid[i][j] == 1 && numNeighbors < 2)
newGrid[i][j] = 0;
else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[i][j] = 1;
else if (grid[i][j] == 1 && numNeighbors > 3)
newGrid[i][j] = 0;
else if (grid[i][j] == 0 && numNeighbors == 3)
newGrid[i][j] = 1;
else
newGrid[i][j] = grid[i][j];
}
}

// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
grid[i][j] = newGrid[i][j];
}
}
}

}

int main(int argc, char* argv[])
{
int i,j,iter,numNeighbors;
int dim = 1024;
// Number of game iterations
int maxIter = 1<<15;

// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

srand(SRAND_VALUE);
for(i = 1; i<=dim; i++) {
for(j = 1; j<=dim; j++) {
grid[i*(dim+2)+j] = rand() % 2;
}
}

// Call the GPU to execute codelet
#pragma hmpp GOL callsite
GOL(dim, grid, newGrid, maxIter);

// Sum up alive cells and print results
int total = 0;
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
total += grid[i*(dim+2)+j];
}
}

printf("Total Alive: %d\n", total);

free(grid);
free(newGrid);

return 0;
}
```

Changes
Lines 6-58

```#pragma hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2], int maxIter)
{
int i, j, iter;

// Main game loop
for (iter = 0; iter<maxIter; iter++) {

// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
}

// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
+ grid[i][j+1] + grid[i][j-1]      //right left
+ grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
+ grid[i-1][j+1] + grid[i+1][j-1];

// Here we have explicitly all of the game rules
if (grid[i][j] == 1 && numNeighbors < 2)
newGrid[i][j] = 0;
else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[i][j] = 1;
else if (grid[i][j] == 1 && numNeighbors > 3)
newGrid[i][j] = 0;
else if (grid[i][j] == 0 && numNeighbors == 3)
newGrid[i][j] = 1;
else
newGrid[i][j] = grid[i][j];
}
}

// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
grid[i][j] = newGrid[i][j];
}
}
}

}
```

To use the codelet method all of the game logic was moved into its own function. With the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the codelet and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the codelet even though it is neccesarily serial to ensure that the game grids are not coppied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below. grid and newGrid are passed into our codelet with their full dimension stated, this is neccesary for HMPP to correctly copy the data over.

```// Get the number of neighbors for a given grid point
numNeighbors = grid[i+1][j] + grid[i-1][j]          //upper lower
+ grid[i][j+1] + grid[i][j-1]      //right left
+ grid[i+1][j+1] + grid[i-1][j-1]  //diagonals
+ grid[i-1][j+1] + grid[i+1][j-1];
```

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our function to calculate the number of alive neighbors a cell has.

```// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
grid[i][j] = newGrid[i][j];
}
}
```

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

```// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cell
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));
```
```grid[i*(dim+2)+j] = rand() % 2;
```
```total += grid[i*(dim+2)+j];
```

HMPP seems to currently be limited to copying single dimensional arrays from host the device. We are able to maintain our multidimensional array indexing in the codelet while on the host we must allocate and address linear arrays.

## GOL-region.c

```#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define idx(i,j) ((i)*(dim+2)+(j))

int main(int argc, char* argv[])
{
int i,j,numNeighbors;
int dim = 1024;
// Number of game iterations
int maxIter = 1<<15;

// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

srand(SRAND_VALUE);
for(i = 1; i<=dim; i++) {
for(j = 1; j<=dim; j++) {
grid[i*(dim+2)+j] = rand() % 2;
}
}

#pragma hmpp GOL region, target=CUDA, args[*].transfer=atcall, args[grid].io=inout &
#pragma hmpp & args[newGrid].io=none
{
int i,j,iter;
// Main game loop
for (iter = 0; iter<maxIter; iter++) {

#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
}

#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
+ grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
+ grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
+ grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

// Here we have explicitly all of the game rules
if (grid[idx(i,j)] == 1 && numNeighbors < 2)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[idx(i,j)] = 1;
else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
newGrid[idx(i,j)] = 1;
else
newGrid[idx(i,j)] = grid[idx(i,j)];
}
}

#pragma hmppcg gridify
// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
grid[idx(i,j)] = newGrid[idx(i,j)];
}
}
}

}
// Sum up alive cells and print results
int total = 0;
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
total += grid[idx(i,j)];
}
}

printf("Total Alive: %d\n", total);

free(grid);
free(newGrid);

return 0;
}
```

Changes

```#define idx(i,j) ((i)*(dim+2)+(j))
```

In the codelet version we were able to stretch the C language a bit and use multidimensional arrays in our codelet and a single dimensional array on our host, this is not the case with the region method. A convenience macro is defined and used whenever grid or newGrid is accessed.

```// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));
```

As in the codelet version HMPP does not currently handle multidimensional arrays properly and so we revert back to a single dimensional array.

```#pragma hmpp GOL region, target=CUDA, args[*].transfer=atcall, args[grid].io=inout &
#pragma hmpp & args[newGrid.io]=none
```

We add the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the region and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the region even though it is neccesarily serial to ensure that the game grids are not coppied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below.

``` int i,j,iter;
```

At the time of this writing HMPP does not correctly identify the induction variables used in hmpp region that are declared outside of the region. The solution is to simply define our induction variables inside of the region.

```#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
```
```#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
```
```// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
```
```#pragma hmppcg gridify
// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
```

The hmppcg directives are used to guide the compiler at the loop level. In this particular case the compiler did not want to parallelize the loops due to the outermost game loop being sequential. Adding hmppcg gridify tells the compiler the loop immediately following the directive should be executed in parallel on the GPU.

```// Get the number of neighbors for a given grid point
numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]          //upper lower
+ grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
+ grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
+ grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];
```

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our function to calculate the number of alive neighbors a cell has.

```#pragma hmppcg gridify
// Can't switch pointers so we mannually have to copy array over
for(i = 1; i<dim+2; i++) {
for(j = 1; j<dim+2; j++) {
grid[idx(i,j)] = newGrid[idx(i,j)];
}
}
```

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

Compiling
We add the –io-report option to hmpp to see compile time host/device memory transfer information.

```\$ hmpp --io-report cc GOL-codelet.c -o gol.out
```

or

```\$ hmpp --io-report cc GOL-region.c -o gol.out
```

Output:
The output will be similar for both the codelet and region methods.

```hmpp: [Info] IO report :
In GROUP 'gol'
REGION 'gol' at GOL-region.c:29, function '__hmpp_region__gol'
Parameter  'dim' has intent IN
Parameter  'maxIter' has intent IN
Parameter  'grid' has intent INOUT
Parameter  'newGrid' has intent INOUT
```

Here we see the intent, the transfer direction with respect to the GPU, of each of our variables. We will see more explicit methods of dealing with data transfer below.

```hmpp: [Info] Generated codelet filename is "gol_cuda.cu".
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:37: Loop 'i' was gridified (1D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:44: Loop 'j' was gridified (1D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-REGION.c:52: Loops 'j' and 'i' were gridified (2D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-REGION.c:77: Loop 'i' was gridified (1D)
hmppcg: [Warning DPL3002] GOLHMPP-TRY0-REGION.c:33: Loop not gridified: will be executed on CPU
```

All of the loops have been gridified indicating a kernel has been generated for the given loop that will be executed in parallel on the GPU. A warming message tells us that the main game loop has not been gridified and will be executed serially on the CPU, this is the expected and correct behavior.

## GOL-codelet.f90

```!\$hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
subroutine GOL(dim, grid, newGrid, maxIter)
implicit none
integer, intent(in) :: dim, maxIter
integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
integer :: i,j,iter,numNeighbors

! Main game loop
do iter=1,maxIter
! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
+ grid(i+1,j) + grid(i-1,j)& !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

! Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo

! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
do i=2,dim+1
grid(i,j) = newGrid(i,j)
enddo
enddo

enddo!end main game loop

end subroutine GOL

program main
implicit none

integer :: i,j,seed(8),numNeighbors,total
real :: randm
! Linear game grid dimension
integer :: dim = 1024
! Number of game iterations
integer :: maxIter = 2**10

! Game grid pointers
integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
allocate(grid(dim+2,dim+2))
allocate(newGrid(dim+2,dim+2))

seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
call random_seed(PUT=seed)
do j=1,dim
do i=1,dim
call random_number(randm)
grid(i,j) = nint(randm)
enddo
enddo

!\$hmpp GOL callsite
call GOL(dim, grid, newGrid, maxIter)

! Sum up alive cells and print results
total = 0
do j=2,dim+1
do i=2,dim+1
total = total + grid(i,j)
enddo
enddo
print *, "Total Alive", total

! Release memory
deallocate(grid)
deallocate(newGrid)

end program
```

Changes
Lines 1-55:

```!\$hmpp GOL codelet, target=CUDA, args[grid].io=inout, args[*].transfer=atcall, args[newGrid].io=none
subroutine GOL(dim, grid, newGrid, maxIter)
implicit none
integer, intent(in) :: dim, maxIter
integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
integer :: i,j,iter,numNeighbors

! Main game loop
do iter=1,maxITer
! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
+ grid(i+1,j) + grid(i-1,j)& !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

! Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo

! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
do i=2,dim+1
grid(i,j) = newGrid(i,j)
enddo
enddo

enddo!end main game loop

end subroutine GOL
```

To use the codelet method all of the game logic was moved into its own subroutine. With the directive option args[grid].io=inout we specify that grid is to be copied onto the GPU at the start of the codelet and copied back to the host at the end. newGrid is specified to have no transfer between the host and device. The main game loop is included inside of the codelet even though it is neccesarily serial to ensure that the game grids are not copied to and from the host after each game iteration. We will look at a more explicit way to manage data transfer below.

```! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)& !left & right
+ grid(i+1,j) + grid(i-1,j)& !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)
```

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our subroutine to calculate the number of alive neighbors a cell has.

```! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
do i=2,dim+1
grid(i,j) = newGrid(i,j)
enddo
enddo
```

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

## GOL-region.f90

```program main
implicit none

integer :: i,j,iter,seed(8),numNeighbors,total
real :: randm
! Linear game grid dimension
integer :: dim = 1024
! Number of game iterations
integer :: maxIter = 2**10

! Game grid pointers
integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
allocate(grid(dim+2,dim+2))
allocate(newGrid(dim+2,dim+2))

seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
call random_seed(PUT=seed)
do j=1,dim
do i=1,dim
call random_number(randm)
grid(i,j) = nint(randm)
enddo
enddo

!\$hmpp GOL region, target=CUDA, args[grid].io=inout, args[*].transfer=atcall &
!\$hmpp & args[newGrid].io=none

! Main game loop
do iter=1,maxITer
! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j, i)
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&    !left & right
+ grid(i+1,j) + grid(i-1,j)&    !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

!Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo

! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
do i=2,dim+1
grid(i,j) = newGrid(i,j)
enddo
enddo

enddo!end main game loop

!\$hmpp GOL endregion

! Sum up alive cells and print results
total = 0
do j=2,dim+1
do i=2,dim+1
total = total + grid(i,j)
enddo
enddo
print *, "Total Alive", total

! Release memory
deallocate(grid)
deallocate(newGrid)

end program
```

Changes

```! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j, i)
do j=2,dim+1
do i=2,dim+1
```

By default the hmpp compiler did not want to gridify the loop over each cell. We can attempt to force the compiler to do so adding the hmppcg directive with gridify(j, i) option which specifies we want to gridify with respect to the induction variables j and i.

```! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&    !left & right
+ grid(i+1,j) + grid(i-1,j)&    !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)
```

In the CUDA programming model all functions must be effectively inlined. Here we manually inline our subroutine to calculate the number of alive neighbors a cell has.

```! Can't switch pointers so we mannually have to copy array over
do j=2,dim+1
do i=2,dim+1
grid(i,j) = newGrid(i,j)
enddo
enddo
```

This modification is very important, device pointers are not accessible inside of the acc region. If you leave the pointer swap in place from the cpu code the compiler will not complain, the code will run, and you will get incorrect results. This is not a limitation of GPU programming as you will see in the CUDA GOL example but of the HMPP directive framework. The easiest way to fix this is to replace the pointer swap with a copy operation as we have done. A possible workaround for this operation is investigated below.

Compiling
We add the –io-report option to hmpp to see compile time host/device memory transfer information.

```\$ hmpp --io-report ftn GOL-codelet.f90 -o gol.out
```

or

```\$ hmpp --io-report ftn GOL-region.f90 -o gol.out
```

Output:
The output will be similar for both the codelet and region methods. In the most recent release as of this writing you may get some warning messages from the compiler about unused variables when using the region method.

```hmpp: [Info] IO report :
In GROUP 'gol'
CODELET 'gol' at GOLHMPP-TRY0-CODELET.f90:1, function 'gol'
Parameter 'dim' has intent IN
Parameter 'grid' has intent INOUT
Parameter 'newgrid' has intent INOUT
Parameter 'maxiter' has intent IN
```

Here we see the intent, the transfer direction with respect to the GPU, of each of our variables. We will see more explicit methods of dealing with data transfer below.

```hmpp: [Info] Generated codelet filename is "gol_cuda.cu".
hmppcg: [Message DPL3000] GOLHMPP-TRY0-CODELET.f90:11: Loop 'j' was gridified (1D)
hmppcg: [Message DPL3000] GOLHMPP-TRY0-CODELET.f90:17: Loop 'i' was gridified (1D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-CODELET.f90:23: Loops 'i' and 'j' were gridified (2D)
hmppcg: [Message DPL3001] GOLHMPP-TRY0-CODELET.f90:47: Loops 'i' and 'j' were gridified (2D)
hmppcg: [Warning DPL3002] GOLHMPP-TRY0-CODELET.f90:9: Loop not gridified: will be executed on CPU
```

All of the loops have been gridified indicating a kernel has been generated for the given loop that will be executed in parallel on the GPU. A warming message tells us that the main game loop has not been gridified and will be executed serially on the CPU, this is the expected and correct behavior.

## GOL-codelet-Try2.c

In this version we will remove the memory copy kernel that we needed to use since HMPP accelerate does not currently suppoort device pointers. As an added bonus we will also take a look at more advanced memory management techniques. There are numerous ways this could be approached but the methods shown highlight many good to know HMPP features.

```#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985

#pragma hmpp <GOL> group target=CUDA
#pragma hmpp <GOL> mapbyname, dim
#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]
#pragma hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
#pragma hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual
void GOL(int dim, int grid[dim+2][dim+2], int newGrid[dim+2][dim+2])
{
int i, j, numNeighbors;

// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[i][0] = grid[i][dim]; //Copy first real column to right ghost column
grid[i][dim+1] = grid[i][1]; //Copy last real column to left ghost column
}
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[0][j] = grid[dim][j]; //Copy first real row to bottom ghost row
grid[dim+1][j] = grid[1][j]; //Copy last real row to top ghost row
}

// Now we loop over all cells and determine their fate
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[i+1][j] + grid[i-1][j]      //upper lower
+ grid[i][j+1] + grid[i][j-1]      //right left
+ grid[i+1][j+1] + grid[i-1][j-1]  //Diagonals
+ grid[i-1][j+1] + grid[i+1][j-1];

// Here we have explicitly all of the game rules
if (grid[i][j] == 1 && numNeighbors < 2)
newGrid[i][j] = 0;
else if (grid[i][j] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[i][j] = 1;
else if (grid[i][j] == 1 && numNeighbors > 3)
newGrid[i][j] = 0;
else if (grid[i][j] == 0 && numNeighbors == 3)
newGrid[i][j] = 1;
else
newGrid[i][j] = grid[i][j];
}
}
}

int main(int argc, char* argv[])
{
int i,j,iter;
// Linear grid dimension
int dim = 1024;
// Number of game iterations
int maxIter = 1<<10;

#pragma hmpp <GOL> allocate
// Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

srand(SRAND_VALUE);
for(i = 1; i<=dim; i++) {
for(j = 1; j<=dim; j++) {
grid[i*(dim+2)+j] = rand() % 2;
}
}

// Copy memory tp GPU

// Main game loop
for (iter = 0; iter<maxIter; iter++) {
if(iter%2 == 0) {
#pragma hmpp <GOL> even callsite
GOL(dim, grid, newGrid);
}
else {
#pragma hmpp <GOL> odd callsite
GOL(dim, grid, newGrid);
}

}//iter loop

// Read back final configuration to confirm it worked
#pragma hmpp <GOL> delegatedstore, args[odd::newGrid]

// Sum up alive cells and print results
int total = 0;
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
total += newGrid[i*(dim+2)+j];
}
}

printf("Total Alive: %d\n", total);

#pragma hmpp <GOL> release
free(grid);
free(newGrid);

return 0;
}
```

Changes

```#pragma hmpp <GOL> group target=CUDA
#pragma hmpp <GOL> mapbyname, dim
#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]
#pragma hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
#pragma hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual
```

A major change to our code has been to get rid of the newGrid to grid memory copy. Instead we use some of the built in HMPP directive functionality.

```#pragma hmpp <GOL> group target=CUDA
```

Here we create what is known as a codelet group, name GOL. This allows us to keep track of similar kernels and set common attributes.

```#pragma hmpp <GOL> mapbyname, dim
```

This line will map the variable dim across all kernels. This means once dim is allocated and copied to the device for one kernel it will be available for use by all kernels in the GOL group.

```#pragma hmpp <GOL> map, args[even::grid;odd::newGrid]
#pragma hmpp <GOL> map, args[even::newGrid;odd::grid]
```

The map directive is used to map arguments between different kernels. This will effectively switch our pointers for us. The notation args[CodeletName1::Argument1,CodeletName2::Argument2] is used such that Argument1 in CodeletName1 will refer to the same device memory as Argument2 in CodeletName2. We see in our case we will soon be defining two kernels, one named even and one named odd.

```#pragma hmpp <GOL> even codelet
#pragma hmpp <GOL> odd  codelet, args[newGrid].io=none
```

The GOL group will contain two codelets, even and odd. Our game will oscillate between calling these codelets in the main game loop. The only difference between these codelets is caused by the map directive used above which effectively switches the device pointers to grid and newGrid.

```#pragma hmpp <GOL> allocate
```

The allocate directive will allocate space on the GPU for all memory needed by the named group. Any memory objects shared by more than one codelet through the map or mapbyname directives will only be allocated once.

```// Copy memory tp GPU
```

Once we have allocated space on the GPU we can copy the results from the host to the GPU with the advancedload directive. Notice when referencing what arguments will be copied we specify one codelet names, either codelet name could be specified.

```// Main game loop
for (iter = 0; iter<maxIter; iter++) {
if(iter%2 == 0) {
#pragma hmpp <GOL> even callsite
GOL(dim, grid, newGrid);
}
else {
#pragma hmpp <GOL> odd callsite
GOL(dim, grid, newGrid);
}
}//iter loop
```

The main game loop induction variable is used to switch between calling our even codelet and our odd codelet. Although it looks as though the same GOL function is being called in both cases because of the map HMPP directive defined with the codelet we know the grid and newGrid GPU pointers will be switched.

```// Read back final configuration to confirm it worked
#pragma hmpp <GOL> delegatedstore, args[odd::newGrid]
```

After the kernel is finished we copy the grid from the device to the host.

```#pragma hmpp <GOL> release
```

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

## GOL-region-Try2.c

```#include <stdio.h>
#include <stdlib.h>

#define SRAND_VALUE 1985
#define idx(i,j) ((i)*(dim+2)+(j))

int main(int argc, char* argv[])
{
int i,j,iter;
int dim = 1024;
//Number of game iterations
int maxIter = 1<<10;

#pragma hmpp GOL allocate, args[grid,newGrid].size={(dim+2)*(dim+2)}
// Allocate square grid of (dim+2)^2 elements, 2 added for perioid boundary condition ghost cells
int *grid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

// Allocate newGrid
int *newGrid = (int*) malloc( sizeof(int) * (dim+2) *(dim+2));

srand(SRAND_VALUE);
for(i = 1; i<=dim; i++) {
for(j = 1; j<=dim; j++) {
grid[i*(dim+2)+j] = rand() % 2;
}
}

// Copy memory tp GPU

// Main game loop
for (iter = 0; iter<maxIter; iter++) {
#pragma hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
{
int i,j;
if(iter%2 == 0)
{
#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
}

#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
+ grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
+ grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
+ grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

// Here we have explicitly all of the game rules
if (grid[idx(i,j)] == 1 && numNeighbors < 2)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[idx(i,j)] = 1;
else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
newGrid[idx(i,j)] = 1;
else
newGrid[idx(i,j)] = grid[idx(i,j)];
}
}

} else {
#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
newGrid[idx(i,0)] = newGrid[idx(i,dim)]; //Copy first real column to right ghost column
newGrid[idx(i,dim+1)] = newGrid[idx(i,1)]; //Copy last real column to left ghost column
}

#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
newGrid[idx(0,j)] = newGrid[idx(dim,j)]; //Copy first real row to bottom ghost row
newGrid[idx(dim+1,j)] = newGrid[idx(1,j)]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = newGrid[idx(i+1,j)] + newGrid[idx(i-1,j)]      //upper lower
+ newGrid[idx(i,j+1)] + newGrid[idx(i,j-1)]      //right left
+ newGrid[idx(i+1,j+1)] + newGrid[idx(i-1,j-1)]  //diagonals
+ newGrid[idx(i-1,j+1)] + newGrid[idx(i+1,j-1)];

// Here we have explicitly all of the game rules
if (newGrid[idx(i,j)] == 1 && numNeighbors < 2)
grid[idx(i,j)] = 0;
else if (newGrid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
grid[idx(i,j)] = 1;
else if (newGrid[idx(i,j)] == 1 && numNeighbors > 3)
grid[idx(i,j)] = 0;
else if (newGrid[idx(i,j)] == 0 && numNeighbors == 3)
grid[idx(i,j)] = 1;
else
grid[idx(i,j)] = newGrid[idx(i,j)];
}
}

}
}
}//iter loop

// Read back final configuration to confirm it worked
#pragma hmpp GOL delegatedstore, args[grid]

// Sum up alive cells and print results
int total = 0;
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
total += grid[i*(dim+2)+j];
}
}

printf("Total Alive: %d\n", total);

#pragma hmpp GOL release
free(grid);
free(newGrid);

return 0;
}
```

Changes

```#pragma hmpp GOL allocate, args[grid,newGrid].size={(dim+2)*(dim+2)}
```

The hmpp GOL allocate directive will allocate space on the GPU for the named region. The compiler requires us to specify the size of the passed in arrays at allocation time.

```hmpp GOL advancedload, args[dim;iter;grid;newGrid]
```

hmpp GOL advancedload, args[dim;iter;grid;newGrid] will copy the named arguments of the region from the host to the GPU.

```#pragma hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
```

In this version of the code we have our region inside of our main game loop but no longer have to worry about unnecessary memory copies.

```           if(iter%2 == 0)
{
#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
grid[idx(i,0)] = grid[idx(i,dim)]; //Copy first real column to right ghost column
grid[idx(i,dim+1)] = grid[idx(i,1)]; //Copy last real column to left ghost column
}

#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
grid[idx(0,j)] = grid[idx(dim,j)]; //Copy first real row to bottom ghost row
grid[idx(dim+1,j)] = grid[idx(1,j)]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = grid[idx(i+1,j)] + grid[idx(i-1,j)]      //upper lower
+ grid[idx(i,j+1)] + grid[idx(i,j-1)]      //right left
+ grid[idx(i+1,j+1)] + grid[idx(i-1,j-1)]  //diagonals
+ grid[idx(i-1,j+1)] + grid[idx(i+1,j-1)];

// Here we have explicitly all of the game rules
if (grid[idx(i,j)] == 1 && numNeighbors < 2)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
newGrid[idx(i,j)] = 1;
else if (grid[idx(i,j)] == 1 && numNeighbors > 3)
newGrid[idx(i,j)] = 0;
else if (grid[idx(i,j)] == 0 && numNeighbors == 3)
newGrid[idx(i,j)] = 1;
else
newGrid[idx(i,j)] = grid[idx(i,j)];
}
}

} else {
#pragma hmppcg gridify
// Left-Right columns
for (i = 1; i<=dim; i++) {
newGrid[idx(i,0)] = newGrid[idx(i,dim)]; //Copy first real column to right ghost column
newGrid[idx(i,dim+1)] = newGrid[idx(i,1)]; //Copy last real column to left ghost column
}

#pragma hmppcg gridify
// Top-Bottom rows
for (j = 0; j<=dim+1; j++) {
newGrid[idx(0,j)] = newGrid[idx(dim,j)]; //Copy first real row to bottom ghost row
newGrid[idx(dim+1,j)] = newGrid[idx(1,j)]; //Copy last real row to top ghost row
}

int numNeighbors;
// Now we loop over all cells and determine their fate
#pragma hmppcg gridify (i,j)
for (i = 1; i<=dim; i++) {
for (j = 1; j<=dim; j++) {
// Get the number of neighbors for a given grid point
numNeighbors = newGrid[idx(i+1,j)] + newGrid[idx(i-1,j)]      //upper lower
+ newGrid[idx(i,j+1)] + newGrid[idx(i,j-1)]      //right left
+ newGrid[idx(i+1,j+1)] + newGrid[idx(i-1,j-1)]  //diagonals
+ newGrid[idx(i-1,j+1)] + newGrid[idx(i+1,j-1)];

// Here we have explicitly all of the game rules
if (newGrid[idx(i,j)] == 1 && numNeighbors < 2)
grid[idx(i,j)] = 0;
else if (newGrid[idx(i,j)] == 1 && (numNeighbors == 2 || numNeighbors == 3))
grid[idx(i,j)] = 1;
else if (newGrid[idx(i,j)] == 1 && numNeighbors > 3)
grid[idx(i,j)] = 0;
else if (newGrid[idx(i,j)] == 0 && numNeighbors == 3)
grid[idx(i,j)] = 1;
else
grid[idx(i,j)] = newGrid[idx(i,j)];
}
}

}
```

A major change to our code has been to get rid of the grid/newGrid switching kernel. We have explicitly coded the two cases and use the modulus of the main iteration variable to switch between the two pieces of code. This increases the code complexity but does get around a costly memory transfer.

```// Read back final configuration to confirm it worked
#pragma hmpp GOL delegatedstore, args[grid]
```

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

```#pragma hmpp GOL release
```

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

## GOL-codelet-Try2.f90

```!\$hmpp <GOL> group, target=CUDA
!\$hmpp <GOL> mapbyname, dim
!\$hmpp <GOL> map, args[even::grid;odd::newGrid]
!\$hmpp <GOL> map, args[even::newGrid;odd::grid]
!\$hmpp <GOL> even codelet, args[dim,grid,newGrid].transfer=manual
!\$hmpp <GOL> odd  codelet, args[dim,grid,newGrid].transfer=manual
subroutine GOL(dim, grid, newGrid)
implicit none
integer, intent(in) :: dim
integer, intent(inout) :: grid(dim+2,dim+2), newGrid(dim+2,dim+2)
integer :: i,j,numNeighbors

! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
+ grid(i+1,j) + grid(i-1,j)&     !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

! Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo
end subroutine GOL

program main
implicit none

integer :: i,j,seed(8),iter,total
real :: randm
! Linear game grid dimension
integer :: dim = 1024
! Number of game iterations
integer :: maxIter = 2**10

! Game grid pointers
integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

!\$hmpp <GOL> allocate
! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
allocate(grid(dim+2,dim+2))
allocate(newGrid(dim+2,dim+2))

seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
call random_seed(PUT=seed)
do j=1,dim
do i=1,dim
call random_number(randm)
grid(i,j) = nint(randm)
enddo
enddo

! Copy memory tp GPU

! Main game loop
do iter=1,maxITer
if(MOD(iter,2) == 1) then
!\$hmpp <GOL> even callsite
call GOL(dim, grid, newGrid)
else
!\$hmpp <GOL> odd callsite
call GOL(dim, grid, newGrid)
endif
enddo!end main game loop

! Read back final configuration to confirm it worked
!\$hmpp <GOL> delegatedstore, args[odd::newGrid]

! Sum up alive cells and print results
total = 0
do j=2,dim+1
do i=2,dim+1
total = total + newGrid(i,j)
enddo
enddo
print *, "Total Alive", total

! Release memory
!\$hmpp <GOL> release
deallocate(grid)
deallocate(newGrid)

end program
```

Changes

```!\$hmpp <GOL> group, target=CUDA
!\$hmpp <GOL> mapbyname, dim
!\$hmpp <GOL> map, args[even::grid;odd::newGrid]
!\$hmpp <GOL> map, args[even::newGrid;odd::grid]
!\$hmpp <GOL> even codelet
!\$hmpp <GOL> odd  codelet
```

A major change to our code has been to get rid of the newGrid to grid memory copy. Instead we use some of the built in HMPP directive functionality.

```!\$hmpp <GOL> group, target=CUDA
```

Here we create what is known as a codelet group, name GOL. This allows us to keep track of similar kernels and set common attributes.

```!\$hmpp <GOL> mapbyname, dim
```

This line will map the variable dim across all kernels. This means once dim is allocated and copied to the device for one kernel it will be available for use by all kernels in the GOL group.

```!\$hmpp <GOL> map, args[even::grid;odd::newGrid]
!\$hmpp <GOL> map, args[even::newGrid;odd::grid]
```

The map directive is used to map arguments between different kernels. This will effectively switch our pointers for us. The notation args[CodeletName1::Argument1,CodeletName2::Argument2] is used such that Argument1 in CodeletName1 will refer to the same device memory as Argument2 in CodeletName2. We see in our case we will soon be defining two kernels, one named even and one named odd.

```!\$hmpp <GOL> even codelet
!\$hmpp <GOL> odd  codelet
```

The GOL group will contain two codelets, even and odd. Our game will oscillate between calling these codelets in the main game loop. The only difference between these codelets is caused by the map directive used above which effectively switches the device pointers to grid and newGrid.

```!\$hmpp <GOL> allocate
```

The allocate directive will allocate space on the GPU for all memory needed by the named group. Any memory objects shared by more than one codelet through the map or mapbyname directives will only be allocated once.

```! Copy memory tp GPU
```

Once we have allocated space on the GPU we can copy the results from the host to the GPU with the advancedload directive. Notice when referencing what arguments will be copied we specify one codelet names, either codelet name could be specified.

```if(MOD(iter,2) == 1) then
!\$hmpp <GOL> even callsite, args[grid].noupdate=true
call GOL(dim, grid, newGrid)
else
!\$hmpp <GOL> odd callsite, args[grid].noupdate=true
call GOL(dim, grid, newGrid)
endif
```

We see that we use our main game loop induction variable to switch between calling our even codelet and our odd codelet. Although it looks as though the same GOL function is being called in both cases because of the map HMPP directive defined with the codelet we know the grid and newGrid GPU pointers will be switched. args[grid].noupdate=true indicates that we do not want to have any implicit data movement between the host and device.

```! Read back final configuration to confirm it worked
!\$hmpp <GOL> delegatedstore, args[odd::newGrid]
```

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

```!\$hmpp <GOL> release
```

Finally since we allocated the device memory we must free it with the directive hmpp GOL release .

## GOL-region-Try2.f90

```program main
implicit none

integer :: i,j,seed(8),iter,numNeighbors,total
real :: randm
! Linear game grid dimension
integer :: dim = 1024
! Number of game iterations
integer :: maxIter = 2**10

! Game grid pointers
integer,dimension(:,:),allocatable :: grid, newGrid, tmpGrid

!\$hmpp GOL allocate, args[grid,newGrid].size={(dim+2),(dim+2)}
! Allocate square grid of (dim+2)^2 elements, 2 added for ghost cells
allocate(grid(dim+2,dim+2))
allocate(newGrid(dim+2,dim+2))

seed = (/1985, 2011, 2012, 500, 24, 15, 99, 8/)
call random_seed(PUT=seed)
do j=1,dim
do i=1,dim
call random_number(randm)
grid(i,j) = nint(randm)
enddo
enddo

! Copy memory tp GPU

! Main game loop
do iter=1,maxITer
!\$hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
if(MOD(iter,2) == 1) then
! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
+ grid(i+1,j) + grid(i-1,j)&     !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

! Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo
else
! Top-Bottom ghost rows
do j=2,dim+1
newGrid(1,j) = newGrid(dim+1,j) !Copy first game grid row to bottom ghost row
newGrid(dim+2,j) = newGrid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
newGrid(i,1) = newGrid(i, dim+1) !Copy first game grid column to right ghost column
newGrid(i,dim+2) = newGrid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = newGrid(i,j+1) + newGrid(i,j-1)&     !right & left
+ newGrid(i+1,j) + newGrid(i-1,j)&     !upper & lower
+ newGrid(i+1,j+1) + newGrid(i-1,j-1)& !diagonals
+ newGrid(i-1,j+1) + newGrid(i+1,j-1)

! Here we have explicitly all of the game rules
if(newGrid(i,j) == 1 .AND. numNeighbors < 2) then
grid(i,j) = 0
elseif(newGrid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
grid(i,j) = 1
elseif(newGrid(i,j) == 1 .AND. numNeighbors > 3) then
grid(i,j) = 0
elseif(newGrid(i,j) == 0 .AND. numNeighbors == 3) then
grid(i,j) = 1
else
grid(i,j) = newGrid(i,j)
endif
enddo
enddo
endif !End grid newGrid switch
!\$hmpp GOL endregion

enddo!end main game loop

! Read back final configuration to confirm it worked
!\$hmpp GOL delegatedstore, args[grid]

! Sum up alive cells and print results
total = 0
do j=2,dim+1
do i=2,dim+1
total = total + grid(i,j)
enddo
enddo
print *, "Total Alive", total

! Release memory
!\$hmpp GOL release
deallocate(grid)
deallocate(newGrid)

end program

```

Changes

```!\$hmpp GOL allocate, args[grid,newGrid].size={(dim+2),(dim+2)}
```

The hmpp GOL allocate directive will allocate space on the GPU for the named region. The compiler requires us to specify the size of the passed in arrays at allocation time with args[grid,newGrid].size={(dim+2),(dim+2)}

```!\$hmpp GOL advancedload, args[dim;grid;newGrid]
```

hmpp GOL advancedload, args[dim;iter;grid;newGrid] will copy the named arguments of the region from the host to the GPU.

```hmpp GOL region, target=CUDA, args[dim,grid,newGrid].transfer=manual, args[iter].transfer=atcall
```

In this version of the code we have our region inside of our main game loop but no longer have to worry about unnecessary memory copies.

Lines 35-109:

```        if(MOD(iter,2) == 1) then
! Top-Bottom ghost rows
do j=2,dim+1
grid(1,j) = grid(dim+1,j) !Copy first game grid row to bottom ghost row
grid(dim+2,j) = grid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
grid(i,1) = grid(i, dim+1) !Copy first game grid column to right ghost column
grid(i,dim+2) = grid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = grid(i,j+1) + grid(i,j-1)&     !right & left
+ grid(i+1,j) + grid(i-1,j)&     !upper & lower
+ grid(i+1,j+1) + grid(i-1,j-1)& !diagonals
+ grid(i-1,j+1) + grid(i+1,j-1)

! Here we have explicitly all of the game rules
if(grid(i,j) == 1 .AND. numNeighbors < 2) then
newGrid(i,j) = 0
elseif(grid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
newGrid(i,j) = 1
elseif(grid(i,j) == 1 .AND. numNeighbors > 3) then
newGrid(i,j) = 0
elseif(grid(i,j) == 0 .AND. numNeighbors == 3) then
newGrid(i,j) = 1
else
newGrid(i,j) = grid(i,j)
endif
enddo
enddo
else
! Top-Bottom ghost rows
do j=2,dim+1
newGrid(1,j) = newGrid(dim+1,j) !Copy first game grid row to bottom ghost row
newGrid(dim+2,j) = newGrid(2,j) !Copy first game grid row to top ghost row
enddo

! Left-Right ghost columns
do i=1,dim+2
newGrid(i,1) = newGrid(i, dim+1) !Copy first game grid column to right ghost column
newGrid(i,dim+2) = newGrid(i,2)  !Copy last game grid column to left ghost column
enddo

! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
do j=2,dim+1
do i=2,dim+1
! Get the number of neighbors for a given grid point
numNeighbors = newGrid(i,j+1) + newGrid(i,j-1)&     !right & left
+ newGrid(i+1,j) + newGrid(i-1,j)&     !upper & lower
+ newGrid(i+1,j+1) + newGrid(i-1,j-1)& !diagonals
+ newGrid(i-1,j+1) + newGrid(i+1,j-1)

! Here we have explicitly all of the game rules
if(newGrid(i,j) == 1 .AND. numNeighbors < 2) then
grid(i,j) = 0
elseif(newGrid(i,j) == 1 .AND. (numNeighbors == 2 .OR. numNeighbors == 3)) then
grid(i,j) = 1
elseif(newGrid(i,j) == 1 .AND. numNeighbors > 3) then
grid(i,j) = 0
elseif(newGrid(i,j) == 0 .AND. numNeighbors == 3) then
grid(i,j) = 1
else
grid(i,j) = newGrid(i,j)
endif
enddo
enddo
endif !End grid newGrid switch
```

A major change to our code has been to get rid of the grid/newGrid switching kernel. We have explicitly coded the two cases and use the modulus of the main iteration variable to switch between the two pieces of code. This increases the code complexity but does get around a costly memory transfer.

```! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
```
```! Now we loop over all cells and determine their fate
!\$hmppcg gridify(j,i)
```

By default the hmpp compiler did not want to gridify the loop over each cell. We can attempt to force the compiler to do so adding thehmppcg gridify(j, i) directive which specifies we want to gridify with respect to the induction variables j and i.

```! Read back final configuration to confirm it worked
!\$hmpp GOL delegatedstore, args[grid]
```

After the kernel is finished we copy the grid from the device to the host with hmpp GOL delegatedstore, args[grid]

```! Release memory
!\$hmpp GOL release
```

Finally since we allocated the device memory we must free it with the directive hmpp GOL release.