Cray’s implementation of MPICH2 allows GPU memory buffers to be passed directly to MPI function calls, eliminating the need to manually copy GPU data to the host before passing it through MPI. Several examples of using this feature are given below. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.

How to Enable

To enable GPUDirect, the following environment variables must be set before invoking aprun:

  $ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
  $ export MPICH_RDMA_ENABLED_CUDA=1

Examples

The following examples initialize memory on the GPU and then perform an MPI_Allgather operation between GPUs.

For these examples, request an interactive job with two nodes:

$ qsub -I -A PROJ### -lnodes=2,walltime=00:30:00

The executables must be placed in /tmp/work/$USER to run properly on the GPU.
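
For example, assuming the compiled executable is named direct.out as in the examples below:

$ cp direct.out /tmp/work/$USER
$ cd /tmp/work/$USER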

CUDA C

direct.cpp

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <mpi.h>

int main( int argc, char** argv )
{
    MPI_Init (&argc, &argv);

    int direct;
    int rank, size;
    int *h_buff = NULL;
    int *d_rank = NULL;
    int *d_buff = NULL;
    size_t bytes;
    int i;

    // Ensure that RDMA ENABLED CUDA is set correctly
    direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL ? 0 : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
    if(direct != 1){
        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
        exit (EXIT_FAILURE);
    }

    // Get MPI rank and size
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);

    // Allocate host and device buffers and copy rank value to GPU
    bytes = size*sizeof(int);
    h_buff = (int*)malloc(bytes);
    cudaMalloc(&d_buff, bytes);
    cudaMalloc(&d_rank, sizeof(int));
    cudaMemcpy(d_rank, &rank, sizeof(int), cudaMemcpyHostToDevice);

    // Perform Allgather using device buffer
    MPI_Allgather(d_rank, 1, MPI_INT, d_buff, 1, MPI_INT, MPI_COMM_WORLD);

    // Check that the GPU buffer is correct
    cudaMemcpy(h_buff, d_buff, bytes, cudaMemcpyDeviceToHost);
    for(i=0; i<size; i++){
        if(h_buff[i] != i) {
            printf ("Alltoall Failed!\n");
            exit (EXIT_FAILURE);
        }
    }
    if(rank==0)
        printf("Success!\n");

    // Clean up
    free(h_buff);
    cudaFree(d_buff);
    cudaFree(d_rank);
    MPI_Finalize();

    return 0;
}

Compiling CUDA C

For ease of compiling, the GNU programming environment will be used.

  $ module load cudatoolkit
  $ module switch PrgEnv-pgi PrgEnv-gnu
  $ CC -lcudart direct.cpp -o direct.out

Running CUDA C

  $ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
  $ export MPICH_RDMA_ENABLED_CUDA=1
  $ aprun -n2 -N1 ./direct.out

CUDA Fortran

direct.cuf

program GPUdirect
    use cudafor
    implicit none
    include 'mpif.h'

    integer :: direct
    character(len=255) :: env_var
    integer :: rank, size, ierror
    integer,dimension(:),allocatable :: h_buff
    integer,device :: d_rank
    integer,dimension(:),allocatable,device :: d_buff
    integer :: i

    call getenv("MPICH_RDMA_ENABLED_CUDA", env_var)
    read( env_var, '(i10)' ) direct
    if (direct .NE. 1) then
      print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!'
      call exit(1)
    endif

    call MPI_INIT(ierror)

    ! Get MPI rank and size
    call MPI_COMM_RANK (MPI_COMM_WORLD, rank, ierror)
    call MPI_COMM_SIZE (MPI_COMM_WORLD, size, ierror)

    ! Initialize host and device buffers
    allocate(h_buff(size))
    allocate(d_buff(size))
    ! Implicitly copy rank to device
    d_rank = rank

    ! Perform allgather using device buffers
    call MPI_ALLGATHER(d_rank, 1, MPI_INTEGER, d_buff, 1, MPI_INTEGER, MPI_COMM_WORLD, ierror)

    ! Check that buffer is correct
    h_buff = d_buff(1:size)
    do i=1,size
        if (h_buff(i) .NE. i-1) then
            print *, 'Allgather Failed!'
            call exit(1)
        endif
    enddo
    if (rank .EQ. 0) then
        print *, 'Success!'
    endif

    ! Clean up
    deallocate(h_buff)
    deallocate(d_buff)
    call MPI_FINALIZE(ierror)

end program GPUdirect

Compiling CUDA Fortran

  $ module load cudatoolkit
  $ ftn direct.cuf -o direct.out

Running CUDA Fortran

  $ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
  $ export MPICH_RDMA_ENABLED_CUDA=1
  $ aprun -n2 -N1 ./direct.out

OpenACC C

direct.c

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main( int argc, char** argv )
{
    MPI_Init (&argc, &argv);

    int direct;
    int rank, size;
    int *restrict buff = NULL;
    size_t bytes;
    int i;

    // Ensure that RDMA ENABLED CUDA is set correctly
    direct = getenv("MPICH_RDMA_ENABLED_CUDA") == NULL ? 0 : atoi(getenv("MPICH_RDMA_ENABLED_CUDA"));
    if(direct != 1){
        printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n");
        exit (EXIT_FAILURE);
    }

    // Get MPI rank and size
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);

    // Initialize buffer
    bytes = size*sizeof(int);
    buff = (int*)malloc(bytes);

    // Copy buff to device at start of region and back to host at end of region
    #pragma acc data copy(rank, buff[0:size])
    {
        // Inside this region the device data pointer will be used
        #pragma acc host_data use_device(rank, buff)
        {
            MPI_Allgather(&rank, 1, MPI_INT, buff, 1, MPI_INT, MPI_COMM_WORLD);
        }
    }

    // Check that buffer is correct
    for(i=0; i<size; i++){
        if(buff[i] != i) {
            printf ("Alltoall Failed!\n");
            exit (EXIT_FAILURE);
        }
    }
    if(rank==0)
        printf("Success!\n");

    // Clean up
    free(buff);

    MPI_Finalize();

    return 0;
}

Compiling OpenACC C

CRAY

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ cc -hpragma=acc direct.c -o direct.out

PGI

$ module load cudatoolkit
$ cc -acc -lcudart direct.c -o direct.out

Running OpenACC C

CRAY

$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
$ export MPICH_RDMA_ENABLED_CUDA=1
$ aprun -n2 -N1 ./direct.out

PGI

$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
$ export MPICH_RDMA_ENABLED_CUDA=1
$ aprun -n2 -N1 ./direct.out

OpenACC Fortran

direct.f90

program GPUdirect
    implicit none
    include 'mpif.h'

    integer :: direct
    character(len=255) :: env_var
    integer :: rank, size, ierror
    integer,dimension(:),allocatable :: buff
    integer :: i

    call getenv("MPICH_RDMA_ENABLED_CUDA", env_var)
    read( env_var, '(i10)' ) direct
    if (direct .NE. 1) then
      print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!'
      call exit(1)
    endif

    call MPI_INIT(ierror)

    ! Get MPI rank and size
    call MPI_COMM_RANK (MPI_COMM_WORLD, rank, ierror)
    call MPI_COMM_SIZE (MPI_COMM_WORLD, size, ierror)

    ! Initialize buffer
    allocate(buff(size))

    ! Copy buff to device at start of region and back to host at end of region
    !$acc data copy(rank, buff(1:size))
        ! Inside this region the device data pointer will be used
        !$acc host_data use_device(rank, buff)
            ! Perform allgather using device buffer
            call MPI_ALLGATHER(rank, 1, MPI_INTEGER, buff, 1, MPI_INTEGER, MPI_COMM_WORLD, ierror)
        !$acc end host_data
    !$acc end data

    ! Check that buffer is correct
    do i=1,size
        if (buff(i) .NE. i-1) then
            print *, 'Allgather Failed!'
            call exit(1)
        endif
    enddo
    if (rank .EQ. 0) then
        print *, 'Success!'
    endif

    ! Clean up
    deallocate(buff)
    call MPI_FINALIZE(ierror)
end program GPUdirect

Compiling OpenACC Fortran

CRAY

$ module switch PrgEnv-pgi PrgEnv-cray
$ module load craype-accel-nvidia35
$ ftn -hacc direct.f90 -o direct.out

PGI

$ module load cudatoolkit
$ ftn -acc -lcudart direct.f90 -o direct.out

Running OpenACC Fortran

CRAY

$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
$ export MPICH_RDMA_ENABLED_CUDA=1
$ aprun -n2 -N1 ./direct.out

PGI

$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
$ export MPICH_RDMA_ENABLED_CUDA=1
$ aprun -n2 -N1 ./direct.out

Optimizations

Several optimizations for improving performance are given below. These optimizations are highly application dependent and may require some trial-and-error tuning to achieve the best results.

Pipelining

Pipelining allows GPU-to-GPU MPI messages to be overlapped and may improve message-passing performance for large, bandwidth-bound messages. Setting the environment variable MPICH_G2G_PIPELINE=N allows a maximum of N GPU-to-GPU messages to be in flight at any given time. The default value of MPICH_G2G_PIPELINE is 16, and messages under 8 kilobytes in size are never pipelined.
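
For example, to allow up to 64 GPU-to-GPU messages in flight at once (the value 64 is illustrative and should be tuned per application):

$ export MPICH_G2G_PIPELINE=64
$ aprun -n2 -N1 ./direct.out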

Nemesis

Applications using asynchronous MPI calls may benefit from enabling the MPICH asynchronous progress feature. Setting the environment variable MPICH_NEMESIS_ASYNC_PROGRESS=1 causes additional threads to be spawned to progress the MPI state.

This feature requires that the thread level be set to multiple: MPICH_MAX_THREAD_SAFETY=multiple.
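
For reference, the sketch below shows one way an application might request and verify the multiple thread level at initialization with MPI_Init_thread; the error handling is illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main( int argc, char** argv )
{
    int provided;

    // Request full thread support; MPICH_MAX_THREAD_SAFETY=multiple must also
    // be set in the environment for this level to be granted
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

    if(provided != MPI_THREAD_MULTIPLE){
        printf ("MPI_THREAD_MULTIPLE not provided!\n");
        exit (EXIT_FAILURE);
    }

    // ... asynchronous MPI calls may now overlap with computation ...

    MPI_Finalize();

    return 0;
}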

This feature works best when used in conjunction with core specialization, aprun -r N, which reserves N CPU cores for system services.
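
Putting these together, a job might be launched as follows (reserving 1 core is illustrative):

$ export MPICH_NEMESIS_ASYNC_PROGRESS=1
$ export MPICH_MAX_THREAD_SAFETY=multiple
$ aprun -n2 -N1 -r 1 ./direct.out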