The following C and Fortran examples show how to combine OpenACC data regions with GPU libraries such as CUFFT. Since files containing CUFFT function calls must be compiled with the nvcc compiler and OpenACC containing files must be compiled with cc/CC/ftn it is necessary to create wrapper functions as shown below. The acc host_data use_device directive will be used which causes the device allocated memory address of specified variables to be used in host code within the directives scope. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov

Care must be taken to ensure that all dependent GPU operations occur on the same CUDA stream. The example below queries OpenACC for the stream in use and executes all library calls within this stream. It is an error to assume OpenACC is using the default stream and doing so may very well produce difficult to detect race conditions.

C/C++

fft.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "openacc.h"

#ifndef M_PI
#define M_PI           3.14159265358979323846
#endif

// Forward declaration of wrapper function that will call CUFFT
extern void launchCUFFT(float *d_data, int n, void *stream);

int main(int argc, char *argv[])
{
    int n = 256;
    float *data = malloc(2*n*sizeof(float));
    int i;

    // Initialize interleaved input data on host
    float w = 7.0;
    float x;
    for(i=0; i<2*n; i+=2)  {
        x = (float)i/2.0/(n-1);
        data[i] = cos(2*M_PI*w*x);
        data[i+1] = 0.0;
    }

    // Copy data to device at start of region and back to host and end of region
    #pragma acc data copy(data[0:2*n])
    {
        // Inside this region the device data pointer will be used
        #pragma acc host_data use_device(data)
        {
           // Query OpenACC for CUDA stream
           void *stream = acc_get_cuda_stream(acc_async_sync);

           // Launch FFT on the GPU
           launchCUFFT(data, n, stream);
        }
    }

    // Find the frequency
    int max_id = 0;
    for(i=0; i<n; i+=2) {
        if( data[i] > data[max_id] )
            max_id = i;
    }
    printf("frequency = %d\n", max_id/2);

    return 0;
}

fft.c + cufft.cu

#include <cufft.h>

// Declared extern "C" to disable C++ name mangling
extern "C" void launchCUFFT(float *d_data, int n, void *stream)
{
    cufftHandle plan;
    cufftPlan1d(&plan, n, CUFFT_C2C, 1);
    cufftSetStream(plan, (cudaStream_t)stream);
    cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
    cufftDestroy(plan);
}

Compiling C/C++

Cray

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ nvcc cufft.cu -c
$ cc -h pragma=acc -lcufft fft.c cufft.o

PGI

$ module load cudatoolkit
$ nvcc cufft.cu -c
$ cc -acc -lcufft fft.c cufft.o

Fortran

fft.f90

module cufft
  INTERFACE
    subroutine launchcufft(data, n, stream) BIND (C, NAME='launchCUFFT')
      USE ISO_C_BINDING
      implicit none
      type (C_PTR), value :: data
      integer (C_INT), value :: n
      type (C_PTR), value :: stream
    end subroutine
  END INTERFACE
end module cufft

program fft
    USE ISO_C_BINDING
    USE cufft
    USE openacc
    IMPLICIT NONE

    INTEGER, PARAMETER :: n = 256
    COMPLEX (C_FLOAT_COMPLEX) :: data(n)
    INTEGER (C_INT):: i
    INTEGER :: max_id,istat
    type (C_PTR) :: stream

    ! Initialize interleaved input data on host
    REAL :: w = 7.0
    REAL :: x
    REAL, PARAMETER :: PI = 3.1415927
    do i=1,n
        x = (i-1.0)/(n-1.0);
        data(i) = CMPLX(COS(2.0*PI*w*x),0.0)
    enddo

    ! Copy data to device at start of region and back to host and end of region
    !$acc data copy(data)

        ! Inside this region the device data pointer will be used
        !$acc host_data use_device(data)

        ! Query OpenACC for CUDA stream
        stream = acc_get_cuda_stream(acc_async_sync)

        ! Launch FFT on the GPU
        call launchcufft(C_LOC(data), n, stream)
        !$acc end host_data

    !$acc end data

    ! Find the frequency
    max_id = 1
    do i=1,n/2
        if (REAL(data(i)) .gt. REAL(data(max_id))) then
            max_id = i-1
        endif
    enddo
    print *, "frequency:", max_id

end program fft

fft.f90 + cufft.cu

#include <cufft.h>

// Declared extern "C" to disable C++ name mangling
extern "C" void launchCUFFT(float *d_data, int n, void *stream)
{
    cufftHandle plan;
    cufftPlan1d(&plan, n, CUFFT_C2C, 1);
    cufftSetStream(plan, (cudaStream_t)stream);
    cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
    cufftDestroy(plan);
}

Compiling Fortran

Cray

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ nvcc -c cufft.cu
$ ftn -c -hacc fft.f90
$ ftn fft.o cufft.o -lcufft

PGI

$ module load cudatoolkit
$ nvcc -c cufft.cu
$ ftn -c -acc fft.f90
$ ftn -acc fft.o cufft.o -lcufft