The following C and Fortran examples show how to leverage OpenACC, cuRand, and Thrust in a single application. This example will use cuRand to generate random data, OpenACC to hash the data, and finally Thrust to sort the data into canonical order. Although this example is provided to illustrate OpenACC interoperability only it was written with a simple nearest neighbor hash implementation given interleaved 3D positions(x1,y1,z1,x2,y2,z2…) in mind. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov
C/C++
sort.c
#include <stdio.h> #include <stdlib.h> #include <math.h> #include "openacc.h" // Forward declaration of nvcc compiled functions void fill_rand(float *d_buffer, int num, void *stream); inline int hash_val(float x, float y, float z); void sort(int *d_key, int *d_values, int num, void *stream); int main(int argc, char* argv[]) { int i,index; float x,y,z; int dim = 3; int num = 1000000; int length = dim*num; float *restrict positions = (float*)malloc(length*sizeof(float)); int *restrict keys = (int*)malloc(num*sizeof(int)); int *restrict values = (int*)malloc(num*sizeof(int)); // Query OpenACC for CUDA stream void *stream = acc_get_cuda_stream(acc_async_sync); // OpenACC will create positions, keys, and values arrays on the device #pragma acc data create(positions[0:length], keys[0:num], values[0:num]) { // NVIDIA cuRandom will create our initial random data #pragma acc host_data use_device(positions) { fill_rand(positions, length, stream); } // OpenACC will calculate the hash value for each particle #pragma acc parallel loop for(i=0; i<num; i++) { index = i*3; x = positions[index]; y = positions[index+1]; z = positions[index+2]; // Key is hash value and value is 'particle id' keys[i] = hash_val(x,y,z); values[i] = i; } // Thrust will be used to sort our key value pairs #pragma acc host_data use_device(keys, values) { sort(keys, values, num, stream); } } return 0; } // Uniform grid hash inline int hash_val(float x, float y, float z) { double spacing = 0.01; // Calculate grid coordinates int grid_x,grid_y,grid_z; grid_x = abs(floor(x/spacing)); grid_y = abs(floor(y/spacing)); grid_z = abs(floor(z/spacing)); int num_x = 1.0/spacing; int num_y = 1.0/spacing; int num_z = 1.0/spacing; int grid_position = (num_x * num_y * grid_z) + (grid_y * num_x + grid_x); return grid_position; }
sortGPU.cu
#include <stdio.h> #include <curand.h> #include <thrust/sort.h> #include <thrust/execution_policy.h> #include <thrust/binary_search.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/system_error.h> // Fill d_buffer with num random numbers extern "C" void fill_rand(float *d_buffer, int num, void *stream) { curandGenerator_t gen; int status; // Create generator status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); // Set CUDA stream status |= curandSetStream(gen, (cudaStream_t)stream); // Set seed status |= curandSetPseudoRandomGeneratorSeed(gen, 1234ULL); // Generate num random numbers status |= curandGenerateUniform(gen, d_buffer, num); // Cleanup generator status |= curandDestroyGenerator(gen); if (status != CURAND_STATUS_SUCCESS) { printf ("curand failure!\n"); exit (EXIT_FAILURE); } } // Sort key value pairs extern "C" void sort(int *keys, int *values, int num, void *stream) { try { // Sort keys AND values array by key thrust::sort_by_key(thrust::cuda::par.on((cudaStream_t)stream), keys, keys + num, values); } catch(thrust::system_error &e) { std::cerr << "Error sorting with Thrust: " << e.what() << std::endl; exit (EXIT_FAILURE); } }
Compiling C/C++
Cray
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ nvcc sortGPU.cu -c $ cc -hpragma=acc -lcurand sort.c sortGPU.o
PGI
$ module load cudatoolkit $ nvcc sortGPU.cu -c $ cc -acc -lcurand sort.c sortGPU.o -Minline
Fortran
sort.f90
! Module containing nvcc compiled function declarations and hash function module sortgpu INTERFACE subroutine fill_rand(positions, length, stream) BIND(C,NAME='fill_rand') USE ISO_C_BINDING implicit none type (C_PTR), value :: positions integer (C_INT), value :: length type (C_PTR), value :: stream end subroutine fill_rand subroutine sort(keys, values, num, stream) BIND(C,NAME='sort') USE ISO_C_BINDING implicit none type (C_PTR), value :: keys type (C_PTR), value :: values integer (C_INT), value :: num type (C_PTR), value :: stream end subroutine sort END INTERFACE contains integer function hash_val(x,y,z) real :: x,y,z,spacing integer :: grid_x, grid_y, grid_z integer :: num_x, num_y, num_z spacing = 0.01; grid_x = abs(floor(x/spacing)) grid_y = abs(floor(y/spacing)) grid_z = abs(floor(z/spacing)) num_x = 1.0/spacing num_y = 1.0/spacing num_z = 1.0/spacing hash_val = (num_x * num_y * grid_z) + (grid_y * num_x + grid_x) + 1 return end function hash_val end module sortgpu program interop use ISO_C_BINDING use sortgpu use openacc implicit none integer :: i,indx real :: x,y,z integer, parameter :: dims = 3 integer(C_INT), parameter :: num = 1000000 integer(C_INT), parameter :: length = dims*num real (C_FLOAT) :: positions(length) integer(C_INT) :: keys(num) integer(C_INT) :: values(num) type (C_PTR) :: stream ! Query OpenACC for CUDA stream stream = acc_get_cuda_stream(acc_async_sync) ! OpenACC will create positions, keys, and values arrays on the device !$acc data create(positions, keys, values) ! NVIDIA cuRandom will create our initial random data !$acc host_data use_device(positions) call fill_rand(C_LOC(positions), length, stream) !$acc end host_data ! OpenACC will calculate the hash value for each particle !$acc parallel loop do i=0,num-1 indx = i*3; x = positions(indx+1) y = positions(indx+2) z = positions(indx+3) ! Key is 'particle' id and value is hashed position keys(i+1) = hash_val(x,y,z); values(i+1) = i+1; enddo !$acc end parallel loop ! Thrust will be used to sort our key value pairs !$acc host_data use_device(keys, values) call sort(C_LOC(keys), C_LOC(values), num, stream); !$acc end host_data !$acc end data end program interop
sortGPU.cu
#include <stdio.h> #include <curand.h> #include <thrust/sort.h> #include <thrust/execution_policy.h> #include <thrust/binary_search.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/system_error.h> // Fill d_buffer with num random numbers extern "C" void fill_rand(float *d_buffer, int num, void *stream) { curandGenerator_t gen; int status; // Create generator status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); // Set CUDA stream status |= curandSetStream(gen, (cudaStream_t)stream); // Set seed status |= curandSetPseudoRandomGeneratorSeed(gen, 1234ULL); // Generate num random numbers status |= curandGenerateUniform(gen, d_buffer, num); // Cleanup generator status |= curandDestroyGenerator(gen); if (status != CURAND_STATUS_SUCCESS) { printf ("curand failure!\n"); exit (EXIT_FAILURE); } } // Sort key value pairs extern "C" void sort(int *keys, int *values, int num, void *stream) { try { // Sort keys AND values array by key thrust::sort_by_key(thrust::cuda::par.on((cudaStream_t)stream), keys, keys + num, values); } catch(thrust::system_error &e) { std::cerr << "Error sorting with Thrust: " << e.what() << std::endl; exit (EXIT_FAILURE); } }
Compiling Fortran
Cray
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ nvcc sortGPU.cu -c $ ftn -hacc -lcurand sort.f90 sortGPU.o
PGI
$ module load cudatoolkit $ module switch pgi pgi/13.6.0 $ nvcc sortGPU.cu -c $ ftn -acc -lcurand sort.f90 sortGPU.o -Minline=name:hash_val