[cfe-dev] Clang and CUDA with C++11 features

Wed Jun 13 21:06:42 PDT 2012

On Wed, Jun 13, 2012 at 11:28:48PM -0400, Peter Colberg wrote:
> The parser interprets the compressed C++11 template parameter syntax
> as a call to a CUDA kernel function. Is there a way to disable parsing
> of the CUDA call syntax <<< >>>? I would be using a C++ wrapper around
> cudaConfigureCall, cudaSetupArgument and cudaLaunch anyway.

Hmmm, the old cudaLaunch trick does not seem to work:

    #include <cuda_runtime.h>

    #include <cstdio>
    #include <cstdlib>

    #define CUDA_REQUIRE( x ) \
        { \
            cudaError_t err = (x); \
            if (err != cudaSuccess) { \
                fprintf( \
                    stderr \
                  , "%s (%d): error: CUDA: %s\n" \
                  , __FILE__ , __LINE__ \
                  , cudaGetErrorString(err) \
                ); \
                exit(1); \
            } \
        }

    __attribute__((global)) void g1(int x, int* g_array)
    {
        g_array[0] = x;
    }

    int main()
    {
        int* g_array = 0;
        CUDA_REQUIRE( cudaMalloc(&g_array, sizeof(*g_array)) );
        CUDA_REQUIRE( cudaMemset(g_array, 0, sizeof(*g_array)) );

        int dev = -1;
        CUDA_REQUIRE( cudaGetDevice(&dev) );
        printf("Using CUDA device #%d\n", dev);

        struct arguments
        {
            int x;
            int* g_array;
        };

        int x = 42;
    #ifdef USE_CUDA_CALL_SYNTAX
        g1<<<1, 1>>>(x, g_array);
    #else
        CUDA_REQUIRE( cudaConfigureCall(1, 1) );
        CUDA_REQUIRE( cudaSetupArgument(&x, sizeof(x), offsetof(arguments, x)) );
        CUDA_REQUIRE( cudaSetupArgument(&g_array, sizeof(g_array), offsetof(arguments, g_array)) );
        CUDA_REQUIRE( cudaLaunch(reinterpret_cast<char const*>(&g1)) );
    #endif
        CUDA_REQUIRE( cudaDeviceSynchronize() );

        int result = 0;
        CUDA_REQUIRE( cudaMemcpy(&result, g_array, sizeof(*g_array), cudaMemcpyDeviceToHost) );
        printf("42 == %d\n", result);
    }

Compile with Clang using <<< >>> syntax:

    clang++ -DUSE_CUDA_CALL_SYNTAX -I/usr/local/cuda-4.2/cuda/include -L/usr/local/cuda-4.2/cuda/lib64 -lcudart -o kernel-call kernel-call.cu

    ./kernel-call 
    Using CUDA device #0
    42 == 0

Compile with Clang using manual cudaLaunch:

    clang++ -I/usr/local/cuda-4.2/cuda/include -L/usr/local/cuda-4.2/cuda/lib64 -lcudart -o kernel-call kernel-call.cu

    ./kernel-call 
    Using CUDA device #0
    kernel-call.cu (48): error: CUDA: invalid device function 

Compile with nvcc using manual cudaLaunch:

    nvcc -I/usr/local/cuda-4.2/cuda/include -L/usr/local/cuda-4.2/cuda/lib64 -lcudart -o kernel-call kernel-call.cu

    ./kernel-call 
    Using CUDA device #0
    42 == 42

How does the glue between host and GPU kernel work?

Could I somehow obtain a cudaLaunch-callable pointer to the GPU kernel?

The other issue is the result of kernel execution. The kernel appears
not to write to the global device memory array. I guess this also
involves the NVPTX backend. Should I forward this to the llvm-dev
mailing list, or are the NVPTX developers reading cfe-dev as well?

Thanks,
Peter