[cfe-dev] Clang and CUDA with C++11 features

Thu Jun 14 18:48:38 PDT 2012

On Thu, Jun 14, 2012 at 02:21:21PM -0400, Justin Holewinski wrote:
> The attached IR is for the host, not the device.  I haven't played around
> with the CUDA front-end in Clang, but I don't think the plumbing is hooked
> up to generate PTX device code and embed it into the final binary.  Someone
> who works on the front-end would be better able to comment.

I reduced the source file to this:

    // kernel.cu

    __attribute__((global)) void f(int* array)
    {
        array[0] = 42;
    }

clang++ -I/usr/local/cuda-4.2/cuda/include -S -emit-llvm -o kernel-x86_64.s kernel.cu

    ; ModuleID = 'kernel.cu'
    target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
    target triple = "x86_64-unknown-linux-gnu"

    define void @_Z1fPi(i32* %array) uwtable {
    entry:
      %array.addr = alloca i32*, align 8
      store i32* %array, i32** %array.addr, align 8
      %0 = bitcast i32** %array.addr to i8*
      %1 = call i32 @cudaSetupArgument(i8* %0, i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64), i64 0)
      %2 = icmp eq i32 %1, 0
      br i1 %2, label %setup.next, label %setup.end

    setup.next:                                       ; preds = %entry
      %3 = call i32 @cudaLaunch(i8* bitcast (void (i32*)* @_Z1fPi to i8*))
      br label %setup.end

    setup.end:                                        ; preds = %setup.next, %entry
      ret void
    }

    declare i32 @cudaSetupArgument(i8*, i64, i64)

    declare i32 @cudaLaunch(i8*)

clang -cc1 -fcuda-is-device -I/usr/local/cuda-4.2/cuda/include -emit-llvm -triple nvptx64-unknown-unknown -o kernel-nvptx64.s kernel.cu

    ; ModuleID = 'kernel.cu'
    target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
    target triple = "nvptx64-unknown-unknown"

    define ptx_kernel void @_Z1fPi(i32* %array) nounwind {
    entry:
      %array.addr = alloca i32*, align 8
      store i32* %array, i32** %array.addr, align 8
      %0 = load i32** %array.addr, align 8
      %arrayidx = getelementptr inbounds i32* %0, i64 0
      store i32 42, i32* %arrayidx, align 4
      ret void
    }

The second output does look like LLVM IR for the device code.

How do I compile this device LLVM IR to PTX, and the PTX to object code?

How do I link device and host code together?

Why is there a cudaLaunch call in the host LLVM IR even though the source contains no <<< >>> kernel launch?

Thanks,
Peter