To fully exploit the computational power of the GPU, a large amount of data parallelism must generally be expressed. In the specific case of accelerated libraries such as cuBLAS, cuFFT, and cuSPARSE, if each individual operation does not possess sufficient data parallelism, another option is to batch many smaller operations into a single large operation. This tutorial demonstrates how to take advantage of batched cuBLAS operations to improve GPU utilization. It also expands on the GPU concurrency topics from the first tutorial through the use of streams and Hyper-Q. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.
C
A Serial Example in C
The following example performs a single DGEMM operation using the cuBLAS version 2 library. This example will be expanded to show the use of batched cuBLAS calls under a variety of situations. The form of our sample DGEMM operation is as follows:
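C = alpha * A * B + beta * C, with alpha = beta = 1

A brief note on the sanity check used throughout these samples (our own explanation, not taken from the library documentation): A and B are diagonal matrices with sin(i) entries and C is diagonal with cos(i)^2, so every diagonal element of the result is sin(i)^2 + cos(i)^2 = 1, and the sum over all elements of the result should equal the matrix dimension dim.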
cublas.c
#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int dim = 100;

    // Allocate host storage for A,B,C square matrices
    double *A, *B, *C;
    A = (double*)malloc(dim*dim*sizeof(double));
    B = (double*)malloc(dim*dim*sizeof(double));
    C = (double*)malloc(dim*dim*sizeof(double));

    // Allocate device storage for A,B,C
    double *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, dim*dim*sizeof(double));
    cudaMalloc((void**)&d_B, dim*dim*sizeof(double));
    cudaMalloc((void**)&d_C, dim*dim*sizeof(double));

    // Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
    // Matrices are arranged column major
    int i,j,index;
    for(j=0; j<dim; j++) {
        for(i=0; i<dim; i++) {
            index = j*dim + i;
            if(i==j) {
                A[index] = sin(index);
                B[index] = sin(index);
                C[index] = cos(index)*cos(index);
            }
            else {
                A[index] = 0.0;
                B[index] = 0.0;
                C[index] = 0.0;
            }
        }
    }

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    cublasSetMatrix(dim, dim, sizeof(double), A, dim, d_A, dim);
    cublasSetMatrix(dim, dim, sizeof(double), B, dim, d_B, dim);
    cublasSetMatrix(dim, dim, sizeof(double), C, dim, d_C, dim);

    // DGEMM: C = alpha*A*B + beta*C
    double alpha = 1.0;
    double beta = 1.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                dim, dim, dim,
                &alpha,
                d_A, dim,
                d_B, dim,
                &beta,
                d_C, dim);

    // Retrieve result matrix from device
    cublasGetMatrix(dim, dim, sizeof(double), d_C, dim, C, dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(j=0; j<dim; j++) {
        for(i=0; i<dim; i++) {
            index = j*dim + i;
            sum += C[index];
        }
    }
    printf("Sum is: %f, should be: %d\n", sum, dim);

    // Clean up resources
    free(A);
    free(B);
    free(C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas.c -o cublas.out
Run:
$ module load cudatoolkit
$ aprun ./cublas.out
Library-provided Batching in C
The most efficient way to increase GPU utilization for small library operations is to use the library-provided batch capabilities. In the code sample below we use cublasDgemmBatched, which takes as arguments arrays of pointers to matrices and the total number of DGEMM operations to perform. By allowing the library to handle the batching, high performance can be achieved with minimal additional effort on the part of the programmer.
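For brevity the samples ignore the status codes returned by cuBLAS calls. When adapting them, it can be worth checking each return value; the minimal sketch below shows one way to do so (the checkCublas helper is our own naming, not part of cuBLAS):

#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"

// Hypothetical helper (not part of cuBLAS): abort if a call did not succeed.
static void checkCublas(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error %d in %s\n", (int)status, msg);
        exit(EXIT_FAILURE);
    }
}

int main(void)
{
    cublasHandle_t handle;
    checkCublas(cublasCreate(&handle), "cublasCreate");
    // ... wrap cublasDgemmBatched and the other calls the same way ...
    checkCublas(cublasDestroy(handle), "cublasDestroy");
    return 0;
}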
cublas-batch.c
#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Create host pointer array to device matrix storage
    double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C;
    h_d_A = (double**)malloc(batch_count*sizeof(double*));
    h_d_B = (double**)malloc(batch_count*sizeof(double*));
    h_d_C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double));
    }

    // Copy the host array of device pointers to the device
    cudaMalloc((void**)&d_A, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_B, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_C, batch_count*sizeof(double*));
    cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim);
    }

    // Set matrix coefficients
    double alpha = 1.0;
    double beta = 1.0;

    // DGEMM: C = alpha*A*B + beta*C
    cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                       dim, dim, dim,
                       &alpha,
                       (const double**)d_A, dim,
                       (const double**)d_B, dim,
                       &beta,
                       d_C, dim,
                       batch_count);

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++)
        cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);

    // Clean up resources
    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(h_d_A[i]);
        cudaFree(h_d_B[i]);
        cudaFree(h_d_C[i]);
    }
    free(A);
    free(B);
    free(C);
    free(h_d_A);
    free(h_d_B);
    free(h_d_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas-batch.c -o cublas-batch.out
Run:
$ module load cudatoolkit
$ aprun ./cublas-batch.out
CUDA Streams in C
Not all library functions have a batched equivalent; in this case, using CUDA streams may be appropriate. Each stream can be considered a separate work queue, allowing kernels from separate streams to execute concurrently if hardware resources allow. In the code sample below we place each DGEMM operation in its own stream, allowing up to 32 concurrent operations through the use of Hyper-Q. Streams are created and destroyed with the CUDA runtime (cudaStreamCreate, cudaStreamDestroy), and the cuBLAS library provides cublasSetStream to associate a stream with subsequent calls.
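Work issued to different streams is asynchronous with respect to the host. In the sample below the blocking cublasGetMatrix calls provide the needed synchronization, but when timing the launches, or when using asynchronous copies instead, the host must wait for the streams explicitly. A minimal, self-contained sketch of that pattern (stream handling only, no cuBLAS calls, with our own BATCH_COUNT constant) might look like:

#include <cuda_runtime.h>

#define BATCH_COUNT 1000

int main(void)
{
    cudaStream_t streams[BATCH_COUNT];
    int i;

    // Create one stream per operation, as in the sample below
    for (i = 0; i < BATCH_COUNT; i++)
        cudaStreamCreate(&streams[i]);

    // ... one cublasDgemm would be launched into each stream here ...

    // Wait for every stream to drain before touching results on the host;
    // cudaDeviceSynchronize() is a coarser, device-wide alternative.
    for (i = 0; i < BATCH_COUNT; i++)
        cudaStreamSynchronize(streams[i]);

    for (i = 0; i < BATCH_COUNT; i++)
        cudaStreamDestroy(streams[i]);

    return 0;
}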
cublas-stream.c
#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"

int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Allocate device storage for batch_count A,B,C
    double **d_A, **d_B, **d_C;
    d_A = (double**)malloc(batch_count*sizeof(double*));
    d_B = (double**)malloc(batch_count*sizeof(double*));
    d_C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&d_C[i], dim*dim*sizeof(double));
    }

    // Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
                else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, d_C[i], dim);
    }

    // Create a stream for every DGEMM operation
    cudaStream_t *streams = (cudaStream_t *) malloc(batch_count*sizeof(cudaStream_t));
    for(i=0; i<batch_count; i++)
        cudaStreamCreate(&streams[i]);

    // Set matrix coefficients
    double alpha = 1.0;
    double beta = 1.0;

    // Launch each DGEMM operation in own CUDA stream
    for(i=0; i<batch_count; i++){
        // Set CUDA stream
        cublasSetStream(handle, streams[i]);

        // DGEMM: C = alpha*A*B + beta*C
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                    dim, dim, dim,
                    &alpha,
                    d_A[i], dim,
                    d_B[i], dim,
                    &beta,
                    d_C[i], dim);
    }

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++) {
        cublasGetMatrix(dim, dim, sizeof(double), d_C[i], dim, C[i], dim);
    }

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);

    // Clean up resources
    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(d_A[i]);
        cudaFree(d_B[i]);
        cudaFree(d_C[i]);
        cudaStreamDestroy(streams[i]);
    }
    free(A);
    free(B);
    free(C);
    // d_A,d_B,d_C are host arrays of device pointers, so release them with free()
    free(d_A);
    free(d_B);
    free(d_C);
    free(streams);
    cublasDestroy(handle);

    return 0;
}
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ cc -lcublas cublas-stream.c -o cublas-stream.out
Run:
$ module load cudatoolkit
$ aprun ./cublas-stream.out
Fortran
Wrappers
NVIDIA provides a Fortran interface for the legacy cuBLAS API, but does not provide one for the version 2 API, which adds the batched operations. One of the changes introduced with the new API is that an opaque cuBLAS handle must be passed into most function calls; while this provides greater control over operations, it is a hindrance to interoperability. The wrapper is split into two pieces: cublas_fortran.cu and cublas_fortran_iso.f90. cublas_fortran.cu provides a lightweight C wrapper around the cuBLAS calls, with most wrappers taking a pointer to the cuBLAS handle that is dereferenced to make the actual call. Since the wrapper functions require only a pointer to the cuBLAS handle, we may interface them with the ISO_C_BINDING module. This ISO_C_BINDING interface is the second piece of the wrapper, cublas_fortran_iso.f90, and allows for robust interoperability between C and Fortran types.
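The same pattern extends to any other version 2 routine that may be needed: a thin extern "C" function receives a pointer to the handle and dereferences it, and a matching ISO_C_BINDING interface exposes it to Fortran. As an illustration only (this routine is not part of the tutorial's wrapper), a C-side wrapper for cublasDdot could look like:

// Illustrative addition to cublas_fortran.cu (not in the original wrapper):
// expose cublasDdot to Fortran by dereferencing the opaque handle pointer.
#include "cublas_v2.h"

extern "C" int f_cublasDdot(cublasHandle_t *handle, int n,
                            const double *x, int incx,
                            const double *y, int incy,
                            double *result)
{
    return cublasDdot(*handle, n, x, incx, y, incy, result);
}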
cublas_fortran.cu
#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"

extern "C" int f_cublasCreate(cublasHandle_t **handle)
{
    *handle = (cublasHandle_t*)malloc(sizeof(cublasHandle_t));
    return cublasCreate(*handle);
}

extern "C" int f_cublasDgemm(cublasHandle_t *handle,
                             cublasOperation_t transa, cublasOperation_t transb,
                             int m, int n, int k,
                             const double *alpha,
                             const double *A, int lda,
                             const double *B, int ldb,
                             const double *beta,
                             double *C, int ldc)
{
    return cublasDgemm(*handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
}

extern "C" int f_cublasDgemmBatched(cublasHandle_t *handle,
                                    cublasOperation_t transa, cublasOperation_t transb,
                                    int m, int n, int k,
                                    const double *alpha,
                                    const double **A, int lda,
                                    const double **B, int ldb,
                                    const double *beta,
                                    double **C, int ldc,
                                    int batch_count)
{
    return cublasDgemmBatched(*handle,transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc,batch_count);
}

extern "C" void f_cublasDestroy(cublasHandle_t *handle)
{
    cublasDestroy(*handle);
    free(handle);
}

extern "C" int f_cudaStreamCreate(cudaStream_t **stream)
{
    *stream = (cudaStream_t *) malloc(sizeof(cudaStream_t));
    return cudaStreamCreate(*stream);
}

extern "C" int f_cublasSetStream(cublasHandle_t *handle, cudaStream_t *streamid)
{
    return cublasSetStream(*handle, *streamid);
}

// Return the CUDA status so the Fortran interface, which declares this as an
// integer function, sees a well-defined value.
extern "C" int f_cudaStreamDestroy(cudaStream_t *stream)
{
    return cudaStreamDestroy(*stream);
}
cublas_fortran_iso.f90
module cublas_f
  use ISO_C_BINDING

  enum, BIND(C)
    enumerator :: CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C
  end enum

  enum, BIND(C)
    enumerator :: cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, &
                  cudaMemcpyDeviceToDevice, cudaMemcpyDefault
  end enum

  INTERFACE
    integer(C_INT) function cudaMalloc(ptr, bytes) BIND(C, NAME='cudaMalloc')
      use ISO_C_BINDING
      type(C_PTR) :: ptr
      integer(C_SIZE_T), value :: bytes
    end function

    integer(C_INT) function cudaMemcpy(dst, src, count, kind) BIND(C, NAME='cudaMemcpy')
      use ISO_C_BINDING
      type(C_PTR), value :: dst
      type(C_PTR), value :: src
      integer(C_SIZE_T), value :: count
      integer(C_INT), value :: kind
    end function

    integer(C_INT) function cublasCreate(handle_ptr) BIND(C, NAME='f_cublasCreate')
      use ISO_C_BINDING
      type(C_PTR) :: handle_ptr
    end function

    subroutine cublasDestroy(handle_ptr) BIND(C, NAME='f_cublasDestroy')
      use ISO_C_BINDING
      type(C_PTR), value :: handle_ptr
    end subroutine

    subroutine cudaFree(ptr) BIND(C, NAME='cudaFree')
      use ISO_C_BINDING
      type(C_PTR), value :: ptr
    end subroutine

    integer(C_INT) function cublasSetMatrix(rows, cols, elemSize, a_ptr, &
                                            lda, b_ptr, ldb) &
                                            BIND(C, NAME='cublasSetMatrix')
      use ISO_C_BINDING
      integer(C_INT), value :: rows
      integer(C_INT), value :: cols
      integer(C_INT), value :: elemSize
      type(C_PTR),    value :: a_ptr
      integer(C_INT), value :: lda
      type(C_PTR),    value :: b_ptr
      integer(C_INT), value :: ldb
    end function

    integer(C_INT) function cublasGetMatrix(rows, cols, elemSize, a_ptr, &
                                            lda, b_ptr, ldb) &
                                            BIND(C, NAME='cublasGetMatrix')
      use ISO_C_BINDING
      integer(C_INT), value :: rows
      integer(C_INT), value :: cols
      integer(C_INT), value :: elemSize
      type(C_PTR),    value :: a_ptr
      integer(C_INT), value :: lda
      type(C_PTR),    value :: b_ptr
      integer(C_INT), value :: ldb
    end function

    integer(C_INT) function cublasDgemm(handle, transa, transb, m, n, k, alpha, &
                                        A, lda, B, ldb, beta, C, ldc) &
                                        BIND(C, NAME='f_cublasDgemm')
      use ISO_C_BINDING
      type(C_PTR),    value :: handle
      integer(C_INT), value :: transa
      integer(C_INT), value :: transb
      integer(C_INT), value :: m
      integer(C_INT), value :: n
      integer(C_INT), value :: k
      real(C_DOUBLE)        :: alpha
      type(C_PTR),    value :: A
      integer(C_INT), value :: lda
      type(C_PTR),    value :: B
      integer(C_INT), value :: ldb
      real(C_DOUBLE)        :: beta
      type(C_PTR),    value :: C
      integer(C_INT), value :: ldc
    end function

    integer(C_INT) function cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, &
                                               A, lda, B, ldb, beta, C, ldc, batch_count) &
                                               BIND(C, NAME='f_cublasDgemmBatched')
      use ISO_C_BINDING
      type(C_PTR),    value :: handle
      integer(C_INT), value :: transa
      integer(C_INT), value :: transb
      integer(C_INT), value :: m
      integer(C_INT), value :: n
      integer(C_INT), value :: k
      real(C_DOUBLE)        :: alpha
      type(C_PTR),    value :: A
      integer(C_INT), value :: lda
      type(C_PTR),    value :: B
      integer(C_INT), value :: ldb
      real(C_DOUBLE)        :: beta
      type(C_PTR),    value :: C
      integer(C_INT), value :: ldc
      integer(C_INT), value :: batch_count
    end function

    integer(C_INT) function cudaStreamCreate(stream_ptr) BIND(C, NAME='f_cudaStreamCreate')
      use ISO_C_BINDING
      type(C_PTR) :: stream_ptr
    end function

    integer(C_INT) function cublasSetStream(handle, stream) BIND(C, NAME='f_cublasSetStream')
      use ISO_C_BINDING
      type(C_PTR), value :: handle
      type(C_PTR), value :: stream
    end function

    integer(C_INT) function cudaStreamDestroy(stream) BIND(C, NAME='f_cudaStreamDestroy')
      use ISO_C_BINDING
      type(C_PTR), value :: stream
    end function

    subroutine cudaDeviceSynchronize() BIND(C, NAME='cudaDeviceSynchronize')
    end subroutine

  END INTERFACE

end module cublas_f
A Serial Example in Fortran
The following example performs a single DGEMM operation using the cuBLAS version 2 library through the wrapper described above. This example will be expanded to show the use of batched cuBLAS calls under a variety of situations. The form of our sample DGEMM operation is the same as in the C example above:
cublas.f90
program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j
    integer(8) :: bytes
    real(8), dimension(:,:), pointer :: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR) :: d_A, d_B, d_C
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    ! Linear dimension of matrices
    dim = 100

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim))
    allocate(B(dim,dim))
    allocate(C(dim,dim))

    ! Allocate device storage for A,B,C
    bytes = int(dim*dim*sizeof_double, 8)
    stat = cudaMalloc(d_A, bytes)
    stat = cudaMalloc(d_B, bytes)
    stat = cudaMalloc(d_C, bytes)

    ! Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
    ! Matrices are arranged column major
    do j=1,dim
        do i=1,dim
            if (i==j) then
                index = real(j*dim + i)
                A(i,j) = sin(index)
                B(i,j) = sin(index)
                C(i,j) = cos(index) * cos(index)
            else
                A(i,j) = 0.0
                B(i,j) = 0.0
                C(i,j) = 0.0
            endif
        enddo
    enddo

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1)), dim, d_A, dim)
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1)), dim, d_B, dim)
    stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1)), dim, d_C, dim)

    ! DGEMM: C = alpha*A*B + beta*C
    alpha = 1.0
    beta = 1.0
    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                       dim, dim, dim, &
                       alpha,         &
                       d_A, dim,      &
                       d_B, dim,      &
                       beta,          &
                       d_C, dim)

    ! Retrieve result matrix from device
    stat = cublasGetMatrix(dim, dim, sizeof_double, d_C, dim, C_LOC(C(1,1)), dim)

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do j=1,dim
        do i=1,dim
            sum = sum + C(i,j)
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim

    deallocate(A)
    deallocate(B)
    deallocate(C)
    call cudaFree(d_A)
    call cudaFree(d_B)
    call cudaFree(d_C)
    call cublasDestroy(handle)

end program
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas.f90 cublas_fortran.o -o cublas.out
Run:
$ module load cudatoolkit
$ aprun ./cublas.out
Library-provided Batching in Fortran
The most efficient way to increase GPU utilization for small library operations is to use the library-provided batch capabilities. In the code sample below we use cublasDgemmBatched, which takes as arguments arrays of pointers to matrices and the total number of DGEMM operations to perform. By allowing the library to handle the batching, high performance can be achieved with minimal additional effort on the part of the programmer.
cublas-batch.f90
program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j, k, batch_count
    integer(8) :: bytes
    real(8), dimension(:,:,:), pointer :: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR) :: d_A, d_B, d_C
    type(C_PTR), dimension(:), pointer :: h_d_A, h_d_B, h_d_C
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    ! Linear dimension of matrices
    dim = 100

    ! Number of A,B,C matrix sets
    batch_count = 1000

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim,batch_count))
    allocate(B(dim,dim,batch_count))
    allocate(C(dim,dim,batch_count))

    ! Create host pointer array to device matrix storage
    allocate(h_d_A(batch_count))
    allocate(h_d_B(batch_count))
    allocate(h_d_C(batch_count))
    bytes = dim*dim*sizeof_double
    do i=1,batch_count
        stat = cudaMalloc(h_d_A(i), bytes)
        stat = cudaMalloc(h_d_B(i), bytes)
        stat = cudaMalloc(h_d_C(i), bytes)
    enddo

    ! Copy the host array of device pointers to the device
    bytes = batch_count*sizeof_double ! 8 byte pointers
    stat = cudaMalloc(d_A, bytes)
    stat = cudaMalloc(d_B, bytes)
    stat = cudaMalloc(d_C, bytes)
    stat = cudaMemcpy(d_A, C_LOC(h_d_A(1)), bytes, cudaMemcpyHostToDevice)
    stat = cudaMemcpy(d_B, C_LOC(h_d_B(1)), bytes, cudaMemcpyHostToDevice)
    stat = cudaMemcpy(d_C, C_LOC(h_d_C(1)), bytes, cudaMemcpyHostToDevice)

    ! Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2
    ! Matrices are arranged column major
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                if (i==j) then
                    index = real(j*dim + i)
                    A(i,j,k) = k*sin(index)
                    B(i,j,k) = sin(index)
                    C(i,j,k) = k*cos(index) * cos(index)
                else
                    A(i,j,k) = 0.0
                    B(i,j,k) = 0.0
                    C(i,j,k) = 0.0
                endif
            enddo ! i
        enddo ! j
    enddo ! k

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    do i=1,batch_count
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1,i)), dim, h_d_A(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1,i)), dim, h_d_B(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1,i)), dim, h_d_C(i), dim)
    enddo

    ! Set matrix coefficients
    alpha = 1.0
    beta = 1.0

    ! Batched DGEMM: C = alpha*A*B + beta*C
    stat = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                              dim, dim, dim, &
                              alpha,         &
                              d_A, dim,      &
                              d_B, dim,      &
                              beta,          &
                              d_C, dim,      &
                              batch_count)

    ! Retrieve result matrix from device
    do i=1,batch_count
        stat = cublasGetMatrix(dim, dim, sizeof_double, h_d_C(i), dim, C_LOC(C(1,1,i)), dim)
    enddo

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                sum = sum + C(i,j,k)
            enddo
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim*(batch_count)*(batch_count+1)/2

    ! Clean up resources
    do i=1,batch_count
        call cudaFree(h_d_A(i))
        call cudaFree(h_d_B(i))
        call cudaFree(h_d_C(i))
    enddo
    deallocate(A)
    deallocate(B)
    deallocate(C)
    call cublasDestroy(handle)

end program
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas-batch.f90 cublas_fortran.o -o cublas-batch.out
Run:
$ module load cudatoolkit
$ aprun ./cublas-batch.out
CUDA Streams in Fortran
Not all library functions have a batched equivalent; in this case, using CUDA streams may be appropriate. Each stream can be considered a separate work queue, allowing kernels from separate streams to execute concurrently if hardware resources allow. In the code sample below we place each DGEMM operation in its own stream, allowing up to 32 concurrent operations through the use of Hyper-Q. As in the C example, streams are created and destroyed through the CUDA runtime, and cublasSetStream associates a stream with subsequent cuBLAS calls.
cublas-stream.f90
program main
    use ISO_C_BINDING
    use cublas_f
    implicit none

    integer :: dim, stat, i, j, k, batch_count
    integer(8) :: bytes
    real(8), dimension(:,:,:), pointer :: A, B, C
    real(8) :: alpha, beta, index, sum
    type(C_PTR), dimension(:), pointer :: d_A, d_B, d_C, streams
    type(C_PTR) :: handle

    integer :: sizeof_double
    parameter (sizeof_double=8)

    ! Linear dimension of matrices
    dim = 100

    ! Number of A,B,C matrix sets
    batch_count = 1000

    ! Allocate host storage for A,B,C square matrices
    allocate(A(dim,dim,batch_count))
    allocate(B(dim,dim,batch_count))
    allocate(C(dim,dim,batch_count))

    ! Allocate device storage for A,B,C
    allocate(d_A(batch_count))
    allocate(d_B(batch_count))
    allocate(d_C(batch_count))
    bytes = int(dim*dim*sizeof_double, 8)
    do i=1,batch_count
        stat = cudaMalloc(d_A(i), bytes)
        stat = cudaMalloc(d_B(i), bytes)
        stat = cudaMalloc(d_C(i), bytes)
    enddo

    ! Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2
    ! Matrices are arranged column major
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                if (i==j) then
                    index = real(j*dim + i)
                    A(i,j,k) = k*sin(index)
                    B(i,j,k) = sin(index)
                    C(i,j,k) = k*cos(index) * cos(index)
                else
                    A(i,j,k) = 0.0
                    B(i,j,k) = 0.0
                    C(i,j,k) = 0.0
                endif
            enddo ! i
        enddo ! j
    enddo ! k

    ! Create cublas instance
    stat = cublasCreate(handle)

    ! Set input matrices on device
    do i=1,batch_count
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(A(1,1,i)), dim, d_A(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(B(1,1,i)), dim, d_B(i), dim)
        stat = cublasSetMatrix(dim, dim, sizeof_double, C_LOC(C(1,1,i)), dim, d_C(i), dim)
    enddo

    ! Create a stream for every DGEMM operation
    allocate(streams(batch_count))
    do i=1,batch_count
        stat = cudaStreamCreate(streams(i))
    enddo

    ! Set matrix coefficients
    alpha = 1.0
    beta = 1.0

    ! Launch each DGEMM operation in own CUDA stream
    do i=1,batch_count
        ! Set CUDA stream
        stat = cublasSetStream(handle, streams(i))

        ! DGEMM: C = alpha*A*B + beta*C
        stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, &
                           dim, dim, dim, &
                           alpha,         &
                           d_A(i), dim,   &
                           d_B(i), dim,   &
                           beta,          &
                           d_C(i), dim)
    enddo

    ! Retrieve result matrix from device
    do i=1,batch_count
        stat = cublasGetMatrix(dim, dim, sizeof_double, d_C(i), dim, C_LOC(C(1,1,i)), dim)
    enddo

    ! Simple sanity test, sum up all elements
    sum = 0.0
    do k=1,batch_count
        do j=1,dim
            do i=1,dim
                sum = sum + C(i,j,k)
            enddo
        enddo
    enddo
    print *, "Sum is:", sum, "should be: ", dim*(batch_count)*(batch_count+1)/2

    ! Clean up resources
    do i=1,batch_count
        stat = cudaStreamDestroy(streams(i))
        call cudaFree(d_A(i))
        call cudaFree(d_B(i))
        call cudaFree(d_C(i))
    enddo
    deallocate(A)
    deallocate(B)
    deallocate(C)
    call cublasDestroy(handle)

end program
Compile:
$ module switch PrgEnv-pgi PrgEnv-gnu
$ module load cudatoolkit
$ nvcc -c cublas_fortran.cu
$ ftn -lcublas cublas_fortran_iso.f90 cublas-stream.f90 cublas_fortran.o -o cublas-stream.out
Run:
$ module load cudatoolkit
$ aprun ./cublas-stream.out