[cfe-dev] Clang and CUDA with C++11 features
Peter Colberg
peter at colberg.org
Thu Jun 14 18:48:38 PDT 2012
On Thu, Jun 14, 2012 at 02:21:21PM -0400, Justin Holewinski wrote:
> The attached IR is for the host, not the device. I haven't played around
> with the CUDA front-end in Clang, but I don't think the plumbing is hooked
> up to generate PTX device code and embed it into the final binary. Someone
> who works on the front-end would be better able to comment.
I reduced the source file to this:
// kernel.cu
__attribute__((global)) void f(int* array)
{
array[0] = 42;
}
clang++ -I/usr/local/cuda-4.2/cuda/include -S -emit-llvm -o kernel-x86_64.s kernel.cu
; ModuleID = 'kernel.cu'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @_Z1fPi(i32* %array) uwtable {
entry:
%array.addr = alloca i32*, align 8
store i32* %array, i32** %array.addr, align 8
%0 = bitcast i32** %array.addr to i8*
%1 = call i32 @cudaSetupArgument(i8* %0, i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64), i64 0)
%2 = icmp eq i32 %1, 0
br i1 %2, label %setup.next, label %setup.end
setup.next: ; preds = %entry
%3 = call i32 @cudaLaunch(i8* bitcast (void (i32*)* @_Z1fPi to i8*))
br label %setup.end
setup.end: ; preds = %setup.next, %entry
ret void
}
declare i32 @cudaSetupArgument(i8*, i64, i64)
declare i32 @cudaLaunch(i8*)
clang -cc1 -fcuda-is-device -I/usr/local/cuda-4.2/cuda/include -emit-llvm -triple nvptx64-unknown-unknown -o kernel-nvptx64.s kernel.cu
; ModuleID = 'kernel.cu'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
define ptx_kernel void @_Z1fPi(i32* %array) nounwind {
entry:
%array.addr = alloca i32*, align 8
store i32* %array, i32** %array.addr, align 8
%0 = load i32** %array.addr, align 8
%arrayidx = getelementptr inbounds i32* %0, i64 0
store i32 42, i32* %arrayidx, align 4
ret void
}
The second output does look like LLVM IR for the device code.
How do I compile the PTX to object code?
How do I link device and host code together?
Why is there a cudaLaunch call in the host LLVM IR even though the source contains no <<< >>> kernel launch?
Thanks,
Peter
More information about the cfe-dev mailing list