Compiling mixed GPU and CPU code

 

Introduction

This tutorial covers linking GPU code compiled with NVCC against CPU code compiled with the Cray compiler wrappers cc/CC/ftn. The Cray compiler wrappers work in conjunction with the modules system to link in needed libraries such as MPI, so it is recommended that the wrappers be used to compile the CPU portions of your code. To keep the build portable across programming environments, any code that contains CUDA C or calls into the CUDA runtime must be compiled with NVCC; the resulting NVCC-compiled object code is then linked against object code compiled with the Cray wrappers. NVCC performs GNU-style C++ name mangling on compiled functions, so care must be taken when compiling and linking the two sides. The full source can be viewed or downloaded from the OLCF GitHub. Please direct any questions or comments to help@nccs.gov.
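
As a quick illustration of the mangling (a hypothetical check; the address field and exact symbol will vary with the function signature), the GNU-mangled symbol that NVCC emits for the wrapper function used in the C++ example below, void vecAdd_wrapper(), can be inspected with nm:

$ nvcc -c vecAddWrapperCXX.cu
$ nm vecAddWrapperCXX.o | grep vecAdd_wrapper
0000000000000000 T _Z14vecAdd_wrapperv

Any host code that calls this function must reference the same mangled name, which is why the host compiler's mangling scheme matters.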

C++

When linking C++ host code with NVCC-compiled code, the C++ code must use GNU-compatible name mangling. This is controlled through compiler wrapper flags.

vecAddWrapperCXX.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *a, float *b, float *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}

void vecAdd_wrapper()
{
    // Size of vectors
    int n = 100000;

    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;

    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(float);

    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);

    // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
}

vecAdd.cxx

#include <stdio.h>

void vecAdd_wrapper(void);

int main( int argc, char* argv[] )
{
    vecAdd_wrapper();

    return 0;
}

Compiling

Under the PGI programming environment the --gnu wrapper flag selects GNU-compatible C++ name mangling; the Cray, GNU, and Intel environments produce GNU-compatible mangling by default, so no extra flag is needed.

PGI :

$ module load cudatoolkit
$ nvcc -c vecAddWrapperCXX.cu
$ CC --gnu vecAdd.cxx vecAddWrapperCXX.o

Cray :

$ module load cudatoolkit
$ nvcc -c vecAddWrapperCXX.cu
$ CC vecAdd.cxx vecAddWrapperCXX.o

GNU :

$ module load cudatoolkit
$ nvcc -c vecAddWrapperCXX.cu
$ CC vecAdd.cxx vecAddWrapperCXX.o

Intel :

$ module load cudatoolkit
$ nvcc -c vecAddWrapperCXX.cu
$ CC vecAdd.cxx vecAddWrapperCXX.o
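
Each element of the result vector is sinf(i)*sinf(i) + cosf(i)*cosf(i), i.e. 1 within floating-point error, so a successful run should print a mean close to 1. On Titan-style Cray systems the GPUs are only available on compute nodes, so the executable must be launched through aprun from a batch or interactive job (exact output may differ slightly):

$ aprun -n 1 ./a.out
final result: 1.000000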

 

C

NVCC's C++ name mangling must be disabled if the wrapper function is to be linked with C code. This is done with the extern "C" function qualifier.
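
With extern "C" the symbol keeps its plain, unmangled C name, which can be verified with nm on the object file built below (a hypothetical check; the address field will vary):

$ nvcc -c vecAddWrapperC.cu
$ nm vecAddWrapperC.o | grep -w vecAdd_wrapper
0000000000000000 T vecAdd_wrapper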

vecAddWrapperC.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *a, float *b, float *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}

extern "C" void vecAdd_wrapper()
{
    // Size of vectors
    int n = 100000;

    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;

    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(float);

    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);

    // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
}

vecAdd.c

#include <stdio.h>

void vecAdd_wrapper(void);

int main( int argc, char* argv[] )
{
    vecAdd_wrapper();

    return 0;
}

Compiling:

There should be no compiler-dependent variations for C.

$ module load cudatoolkit
$ nvcc -c vecAddWrapperC.cu
$ cc vecAdd.c vecAddWrapperC.o

Fortran: Simple

This simple method allows C code to be called from Fortran but requires that the C function name be modified to match the Fortran compiler's conventions. NVCC's C++ name mangling must be disabled if the wrapper is to be linked with Fortran code, which requires the extern "C" function qualifier. Additionally, the function name must be lowercase and end in an underscore (_). The ISO_C_BINDING method in the example that follows allows Fortran to call C functions without renaming them.
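
With most Fortran compilers available through the wrappers (GNU, PGI, Intel, and Cray in its default configuration), a reference such as call vecadd_wrapper() is lowered to the symbol vecadd_wrapper_, which can be confirmed with nm (a hypothetical check; the exact convention is compiler dependent):

$ ftn -c vecAdd.f90
$ nm vecAdd.o | grep vecadd
                 U vecadd_wrapper_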

vecAddWrapperF.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *a, float *b, float *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}

extern "C" void vecadd_wrapper_()
{
    // Size of vectors
    int n = 100000;

    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;

    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(float);

    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);

    // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
}

vecAdd.f90

program vecAdd
    call vecadd_wrapper()
end program vecAdd

Compiling:

There should be no compiler-dependent variations for Fortran.

$ module load cudatoolkit
$ nvcc -c vecAddWrapperF.cu
$ ftn vecAddWrapperF.o vecAdd.f90

Fortran: ISO_C_BINDING

The ISO_C_BINDING module gives Fortran greater interoperability with C and removes the need to modify the C function name; the BIND(C, NAME='...') attribute in the interface below supplies the exact linker symbol to reference.

vecAddWrapperISO.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *a, float *b, float *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}

extern "C" void vecAdd_wrapper()
{
    // Size of vectors
    int n = 100000;

    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;

    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(float);

    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);

    // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
        h_a[i] = sinf(i)*sinf(i);
        h_b[i] = cosf(i)*cosf(i);
    }

    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

    int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
}

vecAddISO.f90

module vector
  INTERFACE
    subroutine vecadd_wrapper() BIND (C, NAME='vecAdd_wrapper')
      USE ISO_C_BINDING
      implicit none
    end subroutine vecadd_wrapper
  END INTERFACE
end module vector

program vecAdd
    use ISO_C_BINDING
    use vector

    call vecadd_wrapper()

end program vecAdd

Compiling:

There should be no compiler-dependent variations for Fortran.

$ module load cudatoolkit
$ nvcc -c vecAddWrapperISO.cu 
$ ftn vecAddWrapperISO.o vecAddISO.f90
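
ISO_C_BINDING also makes it straightforward to pass arguments across the language boundary. The sketch below is an extension of the example above, not part of the original code: it assumes the .cu file defines a hypothetical wrapper extern "C" void vecAdd_wrapper_n(int n) that takes the vector length as an argument. The interoperable kind C_INT together with the VALUE attribute matches C's pass-by-value int:

module vector_n
  INTERFACE
    ! Hypothetical interface; assumes the CUDA side defines
    ! extern "C" void vecAdd_wrapper_n(int n)
    subroutine vecadd_wrapper_n(n) BIND (C, NAME='vecAdd_wrapper_n')
      USE ISO_C_BINDING
      implicit none
      integer (C_INT), value :: n
    end subroutine vecadd_wrapper_n
  END INTERFACE
end module vector_n

program vecAddN
    use ISO_C_BINDING
    use vector_n

    ! Vector length is now chosen by the Fortran caller
    call vecadd_wrapper_n(100000)

end program vecAddN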