Cray’s implementation of MPICH2 allows GPU memory buffers to be passed directly to MPI function calls, eliminating the need to manually copy GPU data to the host before passing data through MPI. Several examples of using this feature are given below. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.
How to Enable
To enable GPUDirect the following steps must be taken before invoking aprun:
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1
Examples
The following examples will initialize memory on the GPU and then preform an MPI_Allgather operation between GPUs.
For these examples use two nodes and request an interactive job.
$qsub -I -A PROJ### -lnodes=2,walltime=00:30:00
The executables must be placed in /tmp/work/$USER to run properly on the GPU.
CUDA C
direct.cpp
#include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include <mpi.h> int main( int argc, char** argv ) { MPI_Init (&argc, &argv); int direct; int rank, size; int *h_buff = NULL; int *d_rank = NULL; int *d_buff = NULL; size_t bytes; int i; // Ensure that RDMA ENABLED CUDA is set correctly direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA")); if(direct != 1){ printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n"); exit (EXIT_FAILURE); } // Get MPI rank and size MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Comm_size (MPI_COMM_WORLD, &size); // Allocate host and device buffers and copy rank value to GPU bytes = size*sizeof(int); h_buff = (int*)malloc(bytes); cudaMalloc(&d_buff, bytes); cudaMalloc(&d_rank, sizeof(int)); cudaMemcpy(d_rank, &rank, sizeof(int), cudaMemcpyHostToDevice); // Preform Allgather using device buffer MPI_Allgather(d_rank, 1, MPI_INT, d_buff, 1, MPI_INT, MPI_COMM_WORLD); // Check that the GPU buffer is correct cudaMemcpy(h_buff, d_buff, bytes, cudaMemcpyDeviceToHost); for(i=0; i<size; i++){ if(h_buff[i] != i) { printf ("Alltoall Failed!\n"); exit (EXIT_FAILURE); } } if(rank==0) printf("Success!\n"); // Clean up free(h_buff); cudaFree(d_buff); cudaFree(d_rank); MPI_Finalize(); return 0; }
Compiling CUDA C
For ease of compiling the GNU environment will be used.
$ module load cudatoolkit $ module switch PrgEnv-pgi PrgEnv-gnu $ CC -lcudart direct.cpp -o direct.out
Running CUDA C
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
CUDA Fortran
direct.cuf
program GPUdirect use cudafor implicit none include 'mpif.h' integer :: direct character(len=255) :: env_var integer :: rank, size, ierror integer,dimension(:),allocatable :: h_buff integer,device :: d_rank integer,dimension(:),allocatable,device :: d_buff integer :: i call getenv("MPICH_RDMA_ENABLED_CUDA", env_var) read( env_var, '(i10)' ) direct if (direct .NE. 1) then print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!' call exit(1) endif call MPI_INIT(ierror) ! Get MPI rank and size call MPI_COMM_RANK (MPI_COMM_WORLD, rank, ierror) call MPI_COMM_SIZE (MPI_COMM_WORLD, size, ierror) ! Initialize host and device buffers allocate(h_buff(size)) allocate(d_buff(size)) ! Implicity copy rank to device d_rank = rank ! Preform allgather using device buffers call MPI_ALLGATHER(d_rank, 1, MPI_INTEGER, d_buff, 1, MPI_INTEGER, MPI_COMM_WORLD, ierror); ! Check that buffer is correct h_buff = d_buff(1:size) do i=1,size if (h_buff(i) .NE. i-1) then print *, 'Alltoall Failed!' call exit(1) endif enddo if (rank .EQ. 0) then print *, 'Success!' endif ! Clean up deallocate(h_buff) deallocate(d_buff) call MPI_FINALIZE(ierror) end program GPUdirect
Compiling CUDA Fortran
$ module load cudatoolkit $ ftn direct.cuf -o direct.out
Running CUDA Fortran
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
OpenACC C
direct.c
#include <stdio.h> #include <stdlib.h> #include <mpi.h> int main( int argc, char** argv ) { MPI_Init (&argc, &argv); int direct; int rank, size; int *restrict buff = NULL; size_t bytes; int i; // Ensure that RDMA ENABLED CUDA is set correctly direct = getenv("MPICH_RDMA_ENABLED_CUDA")==NULL?0:atoi(getenv ("MPICH_RDMA_ENABLED_CUDA")); if(direct != 1){ printf ("MPICH_RDMA_ENABLED_CUDA not enabled!\n"); exit (EXIT_FAILURE); } // Get MPI rank and size MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Comm_size (MPI_COMM_WORLD, &size); // Initialize buffer bytes = size*sizeof(int); buff = (int*)malloc(bytes); // Copy buff to device at start of region and back to host and end of region #pragma acc data copy(rank, buff[0:size]) { // Inside this region the device data pointer will be used #pragma acc host_data use_device(rank, buff) { MPI_Allgather(&rank, 1, MPI_INT, buff, 1, MPI_INT, MPI_COMM_WORLD); } } // Check that buffer is correct for(i=0; i<size; i++){ if(buff[i] != i) { printf ("Alltoall Failed!\n"); exit (EXIT_FAILURE); } } if(rank==0) printf("Success!\n"); // Clean up free(buff); MPI_Finalize(); return 0; }
Compiling OpenACC C
CRAY
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ cc -hpragma=acc direct.c -o direct.out
PGI
$ module load cudatoolkit $ cc -acc -lcudart direct.c -o direct.out
Running OpenACC C
CRAY
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
PGI
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
OpenACC Fortran
direct.f90
program GPUdirect include 'mpif.h' integer :: direct character(len=255) :: env_var integer :: rank, size, ierror integer,dimension(:),allocatable :: buff integer :: i call getenv("MPICH_RDMA_ENABLED_CUDA", env_var) read( env_var, '(i10)' ) direct if (direct .NE. 1) then print *, 'MPICH_RDMA_ENABLED_CUDA not enabled!' call exit(1) endif call MPI_INIT(ierror) ! Get MPI rank and size call MPI_COMM_RANK (MPI_COMM_WORLD, rank, ierror) call MPI_COMM_SIZE (MPI_COMM_WORLD, size, ierror) ! Initialize buffer allocate(buff(size)) ! Copy buff to device at start of region and back to host and end of region !$acc data copy(rank, buff(1:size)) ! Inside this region the device data pointer will be used !$acc host_data use_device(rank, buff) ! Preform all to all using device buffer call MPI_ALLGATHER(rank, 1, MPI_INT, buff, 1, MPI_INT, MPI_COMM_WORLD, ierror); !$acc end host_data !$acc end data ! Check that buffer is correct do i=1,size if (buff(i) .NE. i-1) then print *, 'Alltoall Failed!' call exit(1) endif enddo if (rank .EQ. 0) then print *, 'Success!' endif ! Clean up deallocate(buff) call MPI_FINALIZE(ierror) end program GPUdirect
Compiling OpenACC Fortran
CRAY
$ module switch PrgEnv-pgi PrgEnv-cray $ module load craype-accel-nvidia35 $ ftn -hacc direct.f90 -o direct.out
PGI
$ module load cudatoolkit $ ftn -acc -lcudart direct.f90 -o direct.out
Running OpenACC Fortran
CRAY
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
PGI
$ export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $ export MPICH_RDMA_ENABLED_CUDA=1 $ aprun -n2 -N1 ./direct.out
Optimizations
Several optimizations for improving performance are given below. These optimizations are highly application dependent and may require some trial and error tuning to achieve best results.
Pipelining
Pipelining allows for overlapping of GPU to GPU MPI messages and may improve message passing performance for large bandwidth bound messages. Setting the environment variable MPICH_G2G_PIPELINE=N allows a maximum of N GPU to GPU messages to be in flight at any given time. The default value of MPICH_G2G_PIPELINE is 16 and messages under 8 Kilobytes in size are never pipelined.
Nemesis
Applications using asynchronous MPI calls may benefit from enabling the MPICH asynchronous progress feature. Setting the MPICH_NEMESIS_ASYNC_PROGRESS=1 environment variable enables additional threads to be spawned to progress the MPI state.
This feature requires that the thread level be set to multiple: MPICH_MAX_THREAD_SAFETY=multiple.
This feature works best when used in conjunction with core specialization: aprun -r N, which allows for N CPU cores to be reserved for system services.