To fully exploit the computational power of the GPU, a large amount of data parallelism must generally be expressed. In the specific case of accelerated libraries such as cuBLAS, cuFFT, and cuSPARSE, if each individual operation does not possess a sufficient amount of data parallelism, another option is to batch many smaller operations into a single large operation. This tutorial demonstrates how to take advantage of batched cuBLAS operations to improve GPU utilization, and it expands on the GPU concurrency topics from the first tutorial through the use of streams and Hyper-Q. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.


C

Note: cuBLAS uses column-major ordering and 1-based indexing. The following examples use square diagonal matrices for simplicity and do not directly address this issue.
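
As a brief illustration of what column-major ordering means for a C programmer, element (i,j) of an n x n matrix stored column major sits at flat index j*n + i (0-based); a row-major C array is effectively handed to cuBLAS as its transpose. A minimal standalone sketch:

#include <stdio.h>

int main(void)
{
    int n = 3;
    double A[9];
    int i, j;

    // Columns are contiguous in column-major storage
    for(j=0; j<n; j++)
        for(i=0; i<n; i++)
            A[j*n + i] = 10*i + j;  // encode (row,col) for inspection

    // Element (2,1): row 2, column 1
    printf("A(2,1) = %g\n", A[1*n + 2]);  // prints 21

    return 0;
}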

A Serial Example in C

The following example performs a single DGEMM operation using the cuBLAS version 2 library. This example will be expanded to show the use of batched cuBLAS calls under a variety of situations. The form of our sample DGEMM operation is as follows:

C = alpha*A*B + beta*C

cublas.c

#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int dim = 100;

    // Allocate host storage for A,B,C square matrices
    double *A, *B, *C;
    A = (double*)malloc(dim*dim*sizeof(double));
    B = (double*)malloc(dim*dim*sizeof(double));
    C = (double*)malloc(dim*dim*sizeof(double));

    // Allocate device storage for A,B,C
    double *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, dim*dim*sizeof(double));
    cudaMalloc((void**)&d_B, dim*dim*sizeof(double));
    cudaMalloc((void**)&d_C, dim*dim*sizeof(double));

    // Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
    // Matrices are arranged column major
    int i,j,index;
    for(j=0; j<dim; j++) {
        for(i=0; i<dim; i++) {
            index = j*dim + i;
            if(i==j) {
                A[index] = sin(index);
                B[index] = sin(index);
                C[index] = cos(index)*cos(index);
            }
            else {
                A[index] = 0.0;
                B[index] = 0.0;
                C[index] = 0.0;
            }
        }    
    }

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    cublasSetMatrix(dim, dim, sizeof(double), A, dim, d_A, dim);
    cublasSetMatrix(dim, dim, sizeof(double), B, dim, d_B, dim);
    cublasSetMatrix(dim, dim, sizeof(double), C, dim, d_C, dim);

    // DGEMM: C = alpha*A*B + beta*C
    double alpha = 1.0;
    double beta  = 1.0;

    cublasDgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_N,
                dim, dim, dim,
                &alpha,
                d_A, dim,
                d_B, dim,
                &beta,
                d_C, dim);

    // Retrieve result matrix from device
    cublasGetMatrix(dim, dim, sizeof(double), d_C, dim, C, dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(j=0; j<dim; j++) {
        for(i=0; i<dim; i++) {
            index = j*dim + i;
            sum += C[index];
        }
    }
    printf("Sum is: %f, should be: %d\n", sum, dim);   

    // Clean up resources
    free(A);
    free(B);
    free(C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
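
The example above omits error checking for brevity. Every cuBLAS call returns a cublasStatus_t and every CUDA runtime call returns a cudaError_t; a minimal checking pattern might look like the sketch below (the CHECK_CUDA and CHECK_CUBLAS macro names are our own, not part of either library):

#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"

// Abort with a location message on any failed CUDA runtime call
#define CHECK_CUDA(call) do {                                   \
    cudaError_t err = (call);                                   \
    if (err != cudaSuccess) {                                   \
        fprintf(stderr, "CUDA error %d at %s:%d\n",             \
                (int)err, __FILE__, __LINE__);                  \
        exit(EXIT_FAILURE);                                     \
    }                                                           \
} while (0)

// Abort with a location message on any failed cuBLAS call
#define CHECK_CUBLAS(call) do {                                 \
    cublasStatus_t st = (call);                                 \
    if (st != CUBLAS_STATUS_SUCCESS) {                          \
        fprintf(stderr, "cuBLAS error %d at %s:%d\n",           \
                (int)st, __FILE__, __LINE__);                   \
        exit(EXIT_FAILURE);                                     \
    }                                                           \
} while (0)

// Usage, e.g.:
//   CHECK_CUDA(cudaMalloc((void**)&d_A, dim*dim*sizeof(double)));
//   CHECK_CUBLAS(cublasCreate(&handle));
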
For simplicity, the following compilations use the GNU programming environment. For more information on using non-GNU compilers, please see the Compiling Mixed GPU and CPU code tutorial.

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas.c -o cublas.out

Run:

$ module load cudatoolkit
$ aprun ./cublas.out

Library-provided Batching in C

The most efficient way to increase GPU utilization for small library operations is to use the library-provided batch capabilities. In the code sample below we use cublasDgemmBatched, which takes as arguments arrays of pointers to the A, B, and C matrices and the total number of DGEMM operations to perform. By allowing the library to handle the batching, high performance can be achieved with minimal additional effort on the part of the programmer.

cublas-batch.c

#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Create host pointer array to device matrix storage
    double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C;
    h_d_A = (double**)malloc(batch_count*sizeof(double*));
    h_d_B = (double**)malloc(batch_count*sizeof(double*));
    h_d_C = (double**)malloc(batch_count*sizeof(double*));

    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double));
    }
    // Copy the host array of device pointers to the device
    cudaMalloc((void**)&d_A, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_B, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_C, batch_count*sizeof(double*));
    cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill A diagonal with k*sin(index), B diagonal with sin(index), C diagonal with k*cos(index)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i   
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim);
    }

    // Set matrix coefficients
    double alpha = 1.0;
    double beta  = 1.0;

    // DGEMM: C = alpha*A*B + beta*C
    cublasDgemmBatched(handle,
                       CUBLAS_OP_N, CUBLAS_OP_N,
                       dim, dim, dim,
                       &alpha,
                       (const double**)d_A, dim,
                       (const double**)d_B, dim,
                       &beta,
                       d_C, dim,
                       batch_count);

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++)
        cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);   

    // Clean up resources

    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(h_d_A[i]);
        cudaFree(h_d_B[i]);
        cudaFree(h_d_C[i]);
    }

    free(A);
    free(B);
    free(C);
    free(h_d_A);
    free(h_d_B);
    free(h_d_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
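
Note that the example above performs one cudaMalloc per matrix, which for small matrices can take longer than the DGEMMs themselves. Since cublasDgemmBatched only requires that the pointer array reference valid device matrices, a common refinement is to back each set of matrices with a single contiguous allocation and fill the pointer array with offsets. A sketch using the names from the example (d_Ablock is our own):

// Replace the per-matrix cudaMalloc loop for A with one allocation
double *d_Ablock;
cudaMalloc((void**)&d_Ablock, batch_count*dim*dim*sizeof(double));
for(i=0; i<batch_count; i++)
    h_d_A[i] = d_Ablock + i*dim*dim;  // fixed-stride offsets into the block

// The device pointer array is built exactly as before
cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

// ... repeat for B and C; a single cudaFree(d_Ablock) then releases all of A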

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas-batch.c -o cublas-batch.out

Run:

$ module load cudatoolkit
$ aprun ./cublas-batch.out

CUDA Streams in C

Not all library functions have a batched equivalent; in such cases, using CUDA streams may be appropriate. Each stream can be considered a separate work queue, allowing kernels from separate streams to execute concurrently if hardware resources allow. In the code sample below we place each DGEMM operation in its own stream, allowing up to 32 concurrent operations through the use of Hyper-Q. cuBLAS provides cublasSetStream to associate a stream with a handle; the streams themselves are created and destroyed through the CUDA runtime.

cublas-stream.c

#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Allocate device storage for batch_count A,B,C
    double **d_A, **d_B, **d_C;
    d_A = (double**)malloc(batch_count*sizeof(double*));
    d_B = (double**)malloc(batch_count*sizeof(double*));
    d_C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&d_C[i], dim*dim*sizeof(double));
    }
    // Fill A diagonal with k*sin(index), B diagonal with sin(index), C diagonal with k*cos(index)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i   
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, d_C[i], dim);
    }

    // Create a stream for every DGEMM operation
    cudaStream_t *streams = (cudaStream_t *) malloc(batch_count*sizeof(cudaStream_t));
    for(i=0; i<batch_count; i++)
        cudaStreamCreate(&streams[i]);

    // Set matrix coefficients
    double alpha = 1.0;
    double beta  = 1.0;

    // Launch each DGEMM operation in own CUDA stream
    for(i=0; i<batch_count; i++){
        // Set CUDA stream
        cublasSetStream(handle, streams[i]);

        // DGEMM: C = alpha*A*B + beta*C
        cublasDgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    dim, dim, dim,
                    &alpha,
                    d_A[i], dim,
                    d_B[i], dim,
                    &beta,
                    d_C[i], dim);
    }

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++) {
        cublasGetMatrix(dim, dim, sizeof(double), d_C[i], dim, C[i], dim);
    }

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);   

    // Clean up resources
    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(d_A[i]);
        cudaFree(d_B[i]);
        cudaFree(d_C[i]);
        cudaStreamDestroy(streams[i]);
    }
    free(A);
    free(B);
    free(C);
    free(streams);
    free(d_A);  // host arrays of device pointers were malloc'd, not cudaMalloc'd
    free(d_B);
    free(d_C);
    cublasDestroy(handle);

    return 0;
}
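
In the example above the blocking cublasGetMatrix calls synchronize with the device before each copy, so no explicit synchronization is needed. If the results are instead retrieved with cublasGetMatrixAsync so the copies can overlap with DGEMMs still in flight, the host matrices must be page-locked and an explicit synchronization is required before reading them. A sketch, reusing the names from the example and assuming each C[i] was allocated with cudaMallocHost rather than malloc:

// Queue each result copy in the stream that produced it
for(i=0; i<batch_count; i++)
    cublasGetMatrixAsync(dim, dim, sizeof(double),
                         d_C[i], dim, C[i], dim, streams[i]);

// Wait for all streams to finish before the host touches C
cudaDeviceSynchronize();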

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas-stream.c -o cublas-stream.out

Run:

$ module load cudatoolkit
$ aprun ./cublas-stream.out

Fortran

Wrappers

NVIDIA provides a Fortran interface for the legacy cuBLAS API, but not for the version 2 API, which adds the batched operations. One of the changes introduced with the new API is that an opaque cuBLAS handle must be passed into most function calls; while this provides greater control over operations, it is a hindrance to interoperability. The wrapper is split into two pieces: cublas_fortran.cu and cublas_fortran_iso.f90. cublas_fortran.cu provides a lightweight C wrapper around cuBLAS calls, with most calls taking a pointer to the cuBLAS handle that can be dereferenced to make the actual call. Since the wrapper functions require only a pointer to the cuBLAS handle, we may interface them with the ISO_C_BINDING module. This ISO_C_BINDING interface is the second piece of our wrapper, cublas_fortran_iso.f90, and allows for robust interoperability between C and Fortran types.

cublas_fortran.cu

#include <stdio.h>
#include "cublas_v2.h"

extern "C" int f_cublasCreate(cublasHandle_t **handle)
{
    *handle = (cublasHandle_t*)malloc(sizeof(cublasHandle_t));
    return cublasCreate(*handle);
}

extern "C" int f_cublasDgemm(cublasHandle_t *handle,
               cublasOperation_t transa, cublasOperation_t transb, 
              int m, int n, int k, 
              const double *alpha,
              const double *A, int lda, 
              const double *B, int ldb,
              const double *beta, 
              double *C, int ldc)
{
    return cublasDgemm(*handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}

extern "C" int f_cublasDgemmBatched(cublasHandle_t *handle,
               cublasOperation_t transa, cublasOperation_t transb,
              int m, int n, int k,
              const double *alpha,
              const double **A, int lda,
              const double **B, int ldb,
              const double *beta,
              double **C, int ldc,
              int batch_count)
{
    return cublasDgemmBatched(*handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,batch_count);
}

extern "C" void f_cublasDestroy(cublasHandle_t *handle)
{
    cublasDestroy(*handle);
    free(handle);
}

extern "C" int f_cudaStreamCreate(cudaStream_t **stream)
{
    *stream = (cudaStream_t *) malloc(sizeof(cudaStream_t));
    return cudaStreamCreate(*stream);
}

extern "C" int f_cublasSetStream(cublasHandle_t *handle, cudaStream_t *streamid)
{
    return cublasSetStream(*handle, *streamid);
}

extern "C" void f_cudaStreamDestroy(cudaStream_t *stream)
{
    cudaStreamDestroy(*stream);
}

cublas_fortran_iso.f90

module cublas_f
  use ISO_C_BINDING

    enum, BIND(C)
        enumerator :: CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C
    end enum
   
    enum, BIND(C)
        enumerator :: cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, &
                      cudaMemcpyDeviceToDevice, cudaMemcpyDefault
    end enum

  INTERFACE
    integer(C_INT) function cudaMalloc(ptr, bytes) BIND(C, NAME='cudaMalloc')
        use ISO_C_BINDING
        type(C_PTR) :: ptr
        integer(C_SIZE_T), value :: bytes
    end function

    integer(C_INT) function cudaMemcpy(dst, src, count, kind) BIND(C, NAME='cudaMemcpy')
        use ISO_C_BINDING
        type(C_PTR), value :: dst
        type(C_PTR), value :: src
        integer(C_SIZE_T), value :: count
        integer(C_INT), value :: kind
    end function

    integer(C_INT) function cublasCreate(handle_ptr) BIND(C, NAME='f_cublasCreate')
        use ISO_C_BINDING
        type(C_PTR) :: handle_ptr
    end function

    subroutine  cublasDestroy(handle_ptr) BIND(C, NAME='f_cublasDestroy')
        use ISO_C_BINDING
        type(C_PTR), value :: handle_ptr
    end subroutine

    subroutine cudaFree(ptr) BIND(C, NAME='cudaFree')
        use ISO_C_BINDING
        type(C_PTR), value :: ptr
    end subroutine

    integer(C_INT) function cublasSetMatrix(rows, cols, elemSize, a_ptr, &
                                           lda, b_ptr, ldb) &
                                           BIND(C, NAME='cublasSetMatrix')
        use ISO_C_BINDING
        integer(C_INT), value :: rows
        integer(C_INT), value :: cols
        integer(C_INT), value :: elemSize
        type(C_PTR),    value :: a_ptr
        integer(C_INT), value :: lda
        type(C_PTR),    value :: b_ptr
        integer(C_INT), value :: ldb
    end function

    integer(C_INT) function cublasGetMatrix(rows, cols, elemSize, a_ptr, &
                                            lda, b_ptr, ldb ) &
                                            BIND(C, NAME='cublasGetMatrix')
        use ISO_C_BINDING
        integer(C_INT), value :: rows
        integer(C_INT), value :: cols
        integer(C_INT), value :: elemSize
        type(C_PTR),    value :: a_ptr
        integer(C_INT), value :: lda
        type(C_PTR),    value :: b_ptr
        integer(C_INT), value :: ldb

    end function

    integer(C_INT) function cublasDgemm(handle, transa, transb, m, n, k, alpha, &
                                        A, lda, B, ldb, beta, C, ldc) &
                                        BIND(C, NAME='f_cublasDgemm')
        use ISO_C_BINDING
        type(C_PTR), value    :: handle
        integer(C_INT), value :: transa
        integer(C_INT), value :: transb
        integer(C_INT), value :: m
        integer(C_INT), value :: n
        integer(C_INT), value :: k
        real(C_DOUBLE)        :: alpha
        type(C_PTR), value    :: A
        integer(C_INT), value :: lda
        type(C_PTR), value    :: B
        integer(C_INT), value :: ldb
        real(C_DOUBLE)        :: beta
        type(C_PTR), value    :: C
        integer(C_INT), value :: ldc
    end function

    integer(C_INT) function cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, &
                                        A, lda, B, ldb, beta, C, ldc, batch_count) &
                                        BIND(C, NAME='f_cublasDgemmBatched')
        use ISO_C_BINDING
        type(C_PTR), value    :: handle
        integer(C_INT), value :: transa
        integer(C_INT), value :: transb
        integer(C_INT), value :: m
        integer(C_INT), value :: n
        integer(C_INT), value :: k
        real(C_DOUBLE)        :: alpha
        type(C_PTR), value    :: A
        integer(C_INT), value :: lda
        type(C_PTR), value    :: B
        integer(C_INT), value :: ldb
        real(C_DOUBLE)        :: beta
        type(C_PTR), value    :: C
        integer(C_INT), value :: ldc
        integer(C_INT), value :: batch_count
    end function

    integer(C_INT) function cudaStreamCreate(stream_ptr) BIND(C, NAME='f_cudaStreamCreate')
        use ISO_C_BINDING
        type(C_PTR) :: stream_ptr
    end function

    integer(C_INT) function cublasSetStream(handle, stream) BIND(C, NAME='f_cublasSetStream')
        use ISO_C_BINDING
        type(C_PTR), value :: handle
        type(C_PTR), value :: stream
    end function

    integer(C_INT) function cudaStreamDestroy(stream) BIND(C, NAME='f_cudaStreamDestroy')
        use ISO_C_BINDING
        type(C_PTR), value :: stream
    end function

    subroutine cudaDeviceSynchronize() BIND(C, NAME='cudaDeviceSynchronize')
    end subroutine

  END INTERFACE

end module cublas_f

A Serial Example in Fortran

The following example performs a single DGEMM operation using the cuBLAS version 2 library. This example will be expanded to show the use of batched cuBLAS calls under a variety of situations. The form of our sample DGEMM operation is as follows:

C = alpha*A*B + beta*C

Note: cuBLAS uses column-major ordering and 1-based indexing to maintain legacy Fortran support.

cublas.f90

program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j
    integer(8) :: bytes
    real(8),dimension(:,:), pointer  :: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR) :: d_A, d_B, d_C
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    !Linear dimension of matrices
    dim = 100

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim))
    allocate(B(dim,dim))
    allocate(C(dim,dim))

    ! Allocate device storage for A,B,C
    bytes = int(dim*dim*sizeof_double, 8) 
    stat = cudaMalloc(d_A, bytes)
    stat = cudaMalloc(d_B, bytes)
    stat = cudaMalloc(d_C, bytes)

    ! Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
    ! Matrices are arranged column major
    do j=1,dim
        do i=1,dim
            if (i==j) then
                index = real(j*dim + i)
                A(i,j) = sin(index)
                B(i,j) = sin(index)
                C(i,j) = cos(index) * cos(index)
            else
                A(i,j) = 0.0
                B(i,j) = 0.0
                C(i,j) = 0.0
            endif
        enddo
    enddo

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1)), dim, d_A, dim)
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1)), dim, d_B, dim)
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1)), dim, d_C, dim)

    ! DGEMM: C = alpha*A*B + beta*C
    alpha = 1.0
    beta = 1.0
    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                       dim, dim, dim, &
                       alpha,         &
                       d_A, dim,      &
                       d_B, dim,      &
                       beta,          &
                       d_C, dim)

    ! Retrieve result matrix from device
    stat = cublasGetMatrix(dim, dim, sizeof_double, d_C, dim, C_LOC(C(1,1)), dim)

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do j=1,dim
        do i=1,dim
            sum = sum + C(i,j)
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim

    deallocate(A)
    deallocate(B)
    deallocate(C)
    call cudaFree(d_A)
    call cudaFree(d_B)
    call cudaFree(d_C)
    call cublasDestroy(handle)

end program
For simplicity, the following compilations use the GNU programming environment. For more information on using non-GNU compilers, please see the Compiling Mixed GPU and CPU code tutorial.

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas.f90 cublas_fortran.o -o cublas.out

Run:

$ module load cudatoolkit
$ aprun ./cublas.out

Library-provided Batching in Fortran

The most efficient way to increase GPU utilization for small library operations is to use the library-provided batch capabilities. In the code sample below we use cublasDgemmBatched, which takes as arguments arrays of pointers to the A, B, and C matrices and the total number of DGEMM operations to perform. By allowing the library to handle the batching, high performance can be achieved with minimal additional effort on the part of the programmer.

cublas-batch.f90

program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j, k, batch_count
    integer(8) :: bytes
    real(8),dimension(:,:,:), pointer:: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR) :: d_A, d_B, d_C
    type(C_PTR), dimension(:), pointer :: h_d_A, h_d_B, h_d_C
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    !Linear dimension of matrices
    dim = 100

    ! Number of A,B,C matrix sets
    batch_count = 1000

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim,batch_count))
    allocate(B(dim,dim,batch_count))
    allocate(C(dim,dim,batch_count))

    ! Create host pointer array to device matrix storage
    allocate(h_d_A(batch_count))
    allocate(h_d_B(batch_count))
    allocate(h_d_C(batch_count))
    bytes = int(dim*dim*sizeof_double, 8)

    do i=1,batch_count
        stat = cudaMalloc(h_d_A(i), bytes)
        stat = cudaMalloc(h_d_B(i), bytes)
        stat = cudaMalloc(h_d_C(i), bytes)
    enddo

    ! Copy the host array of device pointers to the device
    bytes = batch_count*sizeof_double ! 8 byte pointers
    stat = cudaMalloc(d_A, bytes)
    stat = cudaMalloc(d_B, bytes)
    stat = cudaMalloc(d_C, bytes)

    stat = cudaMemcpy(d_A, C_LOC(h_d_A(1)), bytes, cudaMemcpyHostToDevice)
    stat = cudaMemcpy(d_B, C_LOC(h_d_B(1)), bytes, cudaMemcpyHostToDevice)
    stat = cudaMemcpy(d_C, C_LOC(h_d_C(1)), bytes, cudaMemcpyHostToDevice)

    ! Fill A diagonal with k*sin(index), B diagonal with sin(index), C diagonal with k*cos(index)^2
    ! Matrices are arranged column major
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                if (i==j) then
                    index = real(j*dim + i)
                    A(i,j,k) = k*sin(index)
                    B(i,j,k) = sin(index)
                    C(i,j,k) = k*cos(index) * cos(index)
                else
                    A(i,j,k) = 0.0
                    B(i,j,k) = 0.0
                    C(i,j,k) = 0.0
                endif
            enddo ! i
        enddo ! j
    enddo ! k

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    do i=1,batch_count
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1,i)), dim, h_d_A(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1,i)), dim, h_d_B(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1,i)), dim, h_d_C(i), dim)
    enddo

    ! Set matrix coefficients
    alpha = 1.0
    beta = 1.0

    ! batched DGEMM: C = alpha*A*B + beta*C
    stat = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                       dim, dim, dim, &
                       alpha,         &
                       d_A, dim,      &
                       d_B, dim,      &
                       beta,          &
                       d_C, dim,      &
                       batch_count)

    ! Retrieve result matrix from device
    do i=1,batch_count
        stat = cublasGetMatrix(dim, dim, sizeof_double, h_d_C(i), dim, C_LOC(C(1,1,i)), dim)
    enddo

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                sum = sum + C(i,j,k)
            enddo
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim*(batch_count)*(batch_count+1)/2

    do i=1,batch_count
        call cudaFree(h_d_A(i))
        call cudaFree(h_d_B(i))
        call cudaFree(h_d_C(i))
    enddo
    call cudaFree(d_A)
    call cudaFree(d_B)
    call cudaFree(d_C)

    deallocate(A)
    deallocate(B)
    deallocate(C)
    deallocate(h_d_A)
    deallocate(h_d_B)
    deallocate(h_d_C)
    call cublasDestroy(handle)

end program

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas-batch.f90 cublas_fortran.o -o cublas-batch.out

Run:

$ module load cudatoolkit
$ aprun ./cublas-batch.out

CUDA Streams in Fortran

Not all library functions have a batched equivalent; in such cases, using CUDA streams may be appropriate. Each stream can be considered a separate work queue, allowing kernels from separate streams to execute concurrently if hardware resources allow. In the code sample below we place each DGEMM operation in its own stream, allowing up to 32 concurrent operations through the use of Hyper-Q. cuBLAS provides cublasSetStream to associate a stream with a handle; the streams themselves are created and destroyed through the CUDA runtime, here via our wrapper functions.

cublas-stream.f90

program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j, k, batch_count
    integer(8) :: bytes
    real(8),dimension(:,:,:), pointer:: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR), dimension(:), pointer :: d_A, d_B, d_C, streams
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    !Linear dimension of matrices
    dim = 100

    ! Number of A,B,C matrix sets
    batch_count = 1000

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim,batch_count))
    allocate(B(dim,dim,batch_count))
    allocate(C(dim,dim,batch_count))

    ! Allocate device storage for A,B,C
    allocate(d_A(batch_count))
    allocate(d_B(batch_count))
    allocate(d_C(batch_count))
    bytes = int(dim*dim*sizeof_double, 8)
    do i=1,batch_count
        stat = cudaMalloc(d_A(i), bytes)
        stat = cudaMalloc(d_B(i), bytes)
        stat = cudaMalloc(d_C(i), bytes)
    enddo

    ! Fill A diagonal with k*sin(index), B diagonal with sin(index), C diagonal with k*cos(index)^2
    ! Matrices are arranged column major
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                if (i==j) then
                    index = real(j*dim + i)
                    A(i,j,k) = k*sin(index)
                    B(i,j,k) = sin(index)
                    C(i,j,k) = k*cos(index) * cos(index)
                else
                    A(i,j,k) = 0.0
                    B(i,j,k) = 0.0
                    C(i,j,k) = 0.0
                endif
            enddo ! i
        enddo ! j
    enddo ! k

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    do i=1,batch_count
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1,i)), dim, d_A(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1,i)), dim, d_B(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1,i)), dim, d_C(i), dim)
    enddo

    ! Create a stream for every DGEMM operation
    allocate(streams(batch_count))
    do i=1,batch_count
        stat = cudaStreamCreate(streams(i))
    enddo

    ! Set matrix coefficients
    alpha = 1.0
    beta = 1.0

    ! Launch each DGEMM operation in own CUDA stream
    do i=1,batch_count
        ! Set CUDA stream
        stat = cublasSetStream(handle, streams(i))

        ! DGEMM: C = alpha*A*B + beta*C
        stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                           dim, dim, dim, &
                           alpha,         &
                           d_A(i), dim,   &
                           d_B(i), dim,   &
                           beta,          &
                           d_C(i), dim)
    enddo

    ! Retrieve result matrix from device
    do i=1,batch_count
        stat = cublasGetMatrix(dim, dim, sizeof_double, d_C(i), dim, C_LOC(C(1,1,i)), dim)
    enddo

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                sum = sum + C(i,j,k)
            enddo
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim*(batch_count)*(batch_count+1)/2

    do i=1,batch_count
        stat = cudaStreamDestroy(streams(i))
        call cudaFree(d_A(i))
        call cudaFree(d_B(i))
        call cudaFree(d_C(i))
    enddo

    deallocate(A)
    deallocate(B)
    deallocate(C)
    deallocate(d_A)
    deallocate(d_B)
    deallocate(d_C)
    deallocate(streams)
    call cublasDestroy(handle)

end program

Compile:

$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas-stream.f90 cublas_fortran.o -o cublas-stream.out

Run:

$ module load cudatoolkit
$ aprun ./cublas-stream.out