The following C and Fortran examples show how to combine OpenACC data regions with GPU libraries such as CUFFT. Since files containing CUFFT function calls must be compiled with the nvcc compiler and OpenACC containing files must be compiled with cc/CC/ftn it is necessary to create wrapper functions as shown below. The acc host_data use_device directive will be used which causes the device allocated memory address of specified variables to be used in host code within the directives scope. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov
C/C++
fft.c
#include <stdio.h> #include <stdlib.h> #include <math.h> #include "openacc.h" #ifndef M_PI #define M_PI 3.14159265358979323846 #endif // Forward declaration of wrapper function that will call CUFFT extern void launchCUFFT(float *d_data, int n, void *stream); int main(int argc, char *argv[]) { int n = 256; float *data = malloc(2*n*sizeof(float)); int i; // Initialize interleaved input data on host float w = 7.0; float x; for(i=0; i<2*n; i+=2) { x = (float)i/2.0/(n-1); data[i] = cos(2*M_PI*w*x); data[i+1] = 0.0; } // Copy data to device at start of region and back to host and end of region #pragma acc data copy(data[0:2*n]) { // Inside this region the device data pointer will be used #pragma acc host_data use_device(data) { // Query OpenACC for CUDA stream void *stream = acc_get_cuda_stream(acc_async_sync); // Launch FFT on the GPU launchCUFFT(data, n, stream); } } // Find the frequency int max_id = 0; for(i=0; i<n; i+=2) { if( data[i] > data[max_id] ) max_id = i; } printf("frequency = %d\n", max_id/2); return 0; }
fft.c + cufft.cu
#include <cufft.h> // Declared extern "C" to disable C++ name mangling extern "C" void launchCUFFT(float *d_data, int n, void *stream) { cufftHandle plan; cufftPlan1d(&plan, n, CUFFT_C2C, 1); cufftSetStream(plan, (cudaStream_t)stream); cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD); cufftDestroy(plan); }
Compiling C/C++
Cray
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ nvcc cufft.cu -c $ cc -h pragma=acc -lcufft fft.c cufft.o
PGI
$ module load cudatoolkit $ nvcc cufft.cu -c $ cc -acc -lcufft fft.c cufft.o
Fortran
fft.f90
module cufft INTERFACE subroutine launchcufft(data, n, stream) BIND (C, NAME='launchCUFFT') USE ISO_C_BINDING implicit none type (C_PTR), value :: data integer (C_INT), value :: n type (C_PTR), value :: stream end subroutine END INTERFACE end module cufft program fft USE ISO_C_BINDING USE cufft USE openacc IMPLICIT NONE INTEGER, PARAMETER :: n = 256 COMPLEX (C_FLOAT_COMPLEX) :: data(n) INTEGER (C_INT):: i INTEGER :: max_id,istat type (C_PTR) :: stream ! Initialize interleaved input data on host REAL :: w = 7.0 REAL :: x REAL, PARAMETER :: PI = 3.1415927 do i=1,n x = (i-1.0)/(n-1.0); data(i) = CMPLX(COS(2.0*PI*w*x),0.0) enddo ! Copy data to device at start of region and back to host and end of region !$acc data copy(data) ! Inside this region the device data pointer will be used !$acc host_data use_device(data) ! Query OpenACC for CUDA stream stream = acc_get_cuda_stream(acc_async_sync) ! Launch FFT on the GPU call launchcufft(C_LOC(data), n, stream) !$acc end host_data !$acc end data ! Find the frequency max_id = 1 do i=1,n/2 if (REAL(data(i)) .gt. REAL(data(max_id))) then max_id = i-1 endif enddo print *, "frequency:", max_id end program fft
fft.f90 + cufft.cu
#include <cufft.h> // Declared extern "C" to disable C++ name mangling extern "C" void launchCUFFT(float *d_data, int n, void *stream) { cufftHandle plan; cufftPlan1d(&plan, n, CUFFT_C2C, 1); cufftSetStream(plan, (cudaStream_t)stream); cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD); cufftDestroy(plan); }
Compiling Fortran
Cray
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ nvcc -c cufft.cu $ ftn -c -hacc fft.f90 $ ftn fft.o cufft.o -lcufft
PGI
$ module load cudatoolkit $ nvcc -c cufft.cu $ ftn -c -acc fft.f90 $ ftn -acc fft.o cufft.o -lcufft