The following C and Fortran examples show how to leverage OpenACC, cuRand, and Thrust in a single application. This example will use cuRand to generate random data, OpenACC to hash the data, and finally Thrust to sort the data into canonical order. Although this example is provided to illustrate OpenACC interoperability only it was written with a simple nearest neighbor hash implementation given interleaved 3D positions(x1,y1,z1,x2,y2,z2…) in mind. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov

Care must be taken to ensure that all dependent GPU operations occur on the same CUDA stream. The example below queries OpenACC for the stream in use and executes all library calls within this stream. It is an error to assume OpenACC is using the default stream and doing so may very well produce difficult to detect race conditions.

C/C++

sort.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "openacc.h"

// Forward declaration of nvcc compiled functions
void fill_rand(float *d_buffer, int num, void *stream);
inline int hash_val(float x, float y, float z);
void sort(int *d_key, int *d_values, int num, void *stream);

int main(int argc, char* argv[])
{
    int i,index;
    float x,y,z;
    int dim = 3;
    int num = 1000000;
    int length = dim*num;
    float *restrict positions = (float*)malloc(length*sizeof(float));
    int *restrict keys   = (int*)malloc(num*sizeof(int));
    int *restrict values = (int*)malloc(num*sizeof(int));

    // Query OpenACC for CUDA stream
    void *stream = acc_get_cuda_stream(acc_async_sync);

    // OpenACC will create positions, keys, and values arrays on the device
    #pragma acc data create(positions[0:length], keys[0:num], values[0:num])
    {
        // NVIDIA cuRandom will create our initial random data
        #pragma acc host_data use_device(positions)
        {
             fill_rand(positions, length, stream);
        }

        // OpenACC will calculate the hash value for each particle
        #pragma acc parallel loop
        for(i=0; i<num; i++) {
            index = i*3;
            x = positions[index];
            y = positions[index+1];
            z = positions[index+2];

            // Key is hash value and value is 'particle id'
            keys[i] = hash_val(x,y,z);
            values[i] = i;
        }

        // Thrust will be used to sort our key value pairs
        #pragma acc host_data use_device(keys, values)
        {
            sort(keys, values, num, stream);
        }
    }

    return 0;
}

// Uniform grid hash
inline int hash_val(float x, float y, float z)
{
    double spacing = 0.01;

    // Calculate grid coordinates
    int grid_x,grid_y,grid_z;
    grid_x = abs(floor(x/spacing));
    grid_y = abs(floor(y/spacing));
    grid_z = abs(floor(z/spacing));

    int num_x = 1.0/spacing;
    int num_y = 1.0/spacing;
    int num_z = 1.0/spacing;

    int grid_position = (num_x * num_y * grid_z) + (grid_y * num_x + grid_x);
    return grid_position;
}

sortGPU.cu

#include <stdio.h>
#include <curand.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system_error.h>

// Fill d_buffer with num random numbers
extern "C" void fill_rand(float *d_buffer, int num, void *stream)
{
  curandGenerator_t gen;
  int status;

  // Create generator
  status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

  // Set CUDA stream
  status |= curandSetStream(gen, (cudaStream_t)stream);

  // Set seed
  status |= curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);

  // Generate num random numbers
  status |= curandGenerateUniform(gen, d_buffer, num);

  // Cleanup generator
  status |= curandDestroyGenerator(gen);

  if (status != CURAND_STATUS_SUCCESS) {
      printf ("curand failure!\n");
      exit (EXIT_FAILURE);
  }
}

// Sort key value pairs
extern "C" void sort(int *keys, int *values, int num, void *stream)
{
    try {
        // Sort keys AND values array by key
        thrust::sort_by_key(thrust::cuda::par.on((cudaStream_t)stream),
                            keys, keys + num, values);
    }
    catch(thrust::system_error &e) {
        std::cerr << "Error sorting with Thrust: " << e.what() << std::endl;
        exit (EXIT_FAILURE);
    }
}

Compiling C/C++

Cray

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ nvcc sortGPU.cu -c
$ cc -hpragma=acc -lcurand sort.c sortGPU.o

PGI

$ module load cudatoolkit

$ nvcc sortGPU.cu -c
$ cc -acc -lcurand sort.c sortGPU.o -Minline

Fortran

sort.f90

! Module containing nvcc compiled function declarations and hash function
module sortgpu
  INTERFACE
    subroutine fill_rand(positions, length, stream) BIND(C,NAME='fill_rand')
      USE ISO_C_BINDING
      implicit none
      type (C_PTR), value :: positions
      integer (C_INT), value :: length
      type (C_PTR), value :: stream
    end subroutine fill_rand
 
    subroutine sort(keys, values, num, stream) BIND(C,NAME='sort')
      USE ISO_C_BINDING
      implicit none
      type (C_PTR), value :: keys
      type (C_PTR), value :: values
      integer (C_INT), value :: num
      type (C_PTR), value :: stream
    end subroutine sort
  END INTERFACE
 
  contains
    integer function hash_val(x,y,z)
      real :: x,y,z,spacing
      integer :: grid_x, grid_y, grid_z
      integer :: num_x, num_y, num_z
 
      spacing =  0.01;
 
      grid_x = abs(floor(x/spacing))
      grid_y = abs(floor(y/spacing))
      grid_z = abs(floor(z/spacing))
 
      num_x = 1.0/spacing
      num_y = 1.0/spacing
      num_z = 1.0/spacing
 
      hash_val =  (num_x * num_y * grid_z) + (grid_y * num_x + grid_x) + 1
      return
    end function hash_val
end module sortgpu
 
program interop
    use ISO_C_BINDING
    use sortgpu
    use openacc
    implicit none
 
    integer :: i,indx
    real :: x,y,z
    integer, parameter :: dims = 3
    integer(C_INT), parameter :: num = 1000000
    integer(C_INT), parameter :: length = dims*num
    real (C_FLOAT) :: positions(length)
    integer(C_INT) :: keys(num)
    integer(C_INT) :: values(num)
    type (C_PTR) :: stream

    ! Query OpenACC for CUDA stream
    stream = acc_get_cuda_stream(acc_async_sync)

    ! OpenACC will create positions, keys, and values arrays on the device
    !$acc data create(positions, keys, values)
 
        ! NVIDIA cuRandom will create our initial random data
        !$acc host_data use_device(positions)
        call fill_rand(C_LOC(positions), length, stream)
        !$acc end host_data
 
        ! OpenACC will calculate the hash value for each particle
        !$acc parallel loop
        do i=0,num-1
            indx = i*3;
            x = positions(indx+1)
            y = positions(indx+2)
            z = positions(indx+3)
 
            ! Key is 'particle' id and value is hashed position
            keys(i+1) = hash_val(x,y,z);
            values(i+1) = i+1;
        enddo
        !$acc end parallel loop
 
        ! Thrust will be used to sort our key value pairs
        !$acc host_data use_device(keys, values)
        call sort(C_LOC(keys), C_LOC(values), num, stream);
        !$acc end host_data
 
    !$acc end data    
 
end program interop

sortGPU.cu

#include <stdio.h>
#include <curand.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system_error.h>

// Fill d_buffer with num random numbers
extern "C" void fill_rand(float *d_buffer, int num, void *stream)
{
  curandGenerator_t gen;
  int status;

  // Create generator
  status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

  // Set CUDA stream
  status |= curandSetStream(gen, (cudaStream_t)stream);

  // Set seed
  status |= curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);

  // Generate num random numbers
  status |= curandGenerateUniform(gen, d_buffer, num);

  // Cleanup generator
  status |= curandDestroyGenerator(gen);

  if (status != CURAND_STATUS_SUCCESS) {
      printf ("curand failure!\n");
      exit (EXIT_FAILURE);
  }
}

// Sort key value pairs
extern "C" void sort(int *keys, int *values, int num, void *stream)
{
    try {
        // Sort keys AND values array by key
        thrust::sort_by_key(thrust::cuda::par.on((cudaStream_t)stream),
                            keys, keys + num, values);
    }
    catch(thrust::system_error &e) {
        std::cerr << "Error sorting with Thrust: " << e.what() << std::endl;
        exit (EXIT_FAILURE);
    }
}

Compiling Fortran

Cray

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ nvcc sortGPU.cu -c
$ ftn -hacc -lcurand sort.f90 sortGPU.o

PGI

$ module load cudatoolkit
$ module switch pgi pgi/13.6.0

$ nvcc sortGPU.cu -c
$ ftn -acc -lcurand sort.f90 sortGPU.o -Minline=name:hash_val