[cfe-dev] Clang and CUDA with C++11 features
Peter Colberg
peter at colberg.org
Thu Jun 14 11:08:26 PDT 2012
On Thu, Jun 14, 2012 at 09:57:27AM -0400, Justin Holewinski wrote:
> Something is probably getting messed up in the IR generation for the kernel
> functions (we recently got rid of the old PTX back-end, on which the CUDA
> integration is based, in favor of the NVPTX back-end). If you post the IR,
> I can take a look at it.
I attached the CUDA source and the LLVM IR.
clang++ -S -emit-llvm -I/usr/local/cuda-4.2/cuda/include -o kernel-call.s kernel-call.cu
clang++ -I/usr/local/cuda-4.2/cuda/include -L/usr/local/cuda-4.2/cuda/lib64 -lcudart -o kernel-call kernel-call.cu
./kernel-call
Using CUDA device #0
42 == 0
clang++ --version
clang version 3.2 (http://llvm.org/git/clang.git f1492f970c7c6eb85dc18f13fb864b185bed1d23) (http://llvm.org/git/llvm.git cba91230c0beef79e5042d8e983198b26aac5616)
Target: x86_64-unknown-linux-gnu
Thread model: posix
Thanks,
Peter
-------------- next part --------------
A non-text attachment was scrubbed...
Name: kernel-call.cu
Type: application/cu-seeme
Size: 1236 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20120614/cf3f414a/attachment.cu>
-------------- next part --------------
; ModuleID = 'kernel-call.cu'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque
@stderr = external global %struct._IO_FILE*
@.str = private unnamed_addr constant [26 x i8] c"%s (%d): error: CUDA: %s\0A\00", align 1
@.str1 = private unnamed_addr constant [15 x i8] c"kernel-call.cu\00", align 1
@.str2 = private unnamed_addr constant [23 x i8] c"Using CUDA device #%d\0A\00", align 1
@.str3 = private unnamed_addr constant [10 x i8] c"42 == %d\0A\00", align 1
define void @_Z2g1iPi(i32 %x, i32* %g_array) uwtable {
entry:
%x.addr = alloca i32, align 4
%g_array.addr = alloca i32*, align 8
store i32 %x, i32* %x.addr, align 4
store i32* %g_array, i32** %g_array.addr, align 8
%0 = bitcast i32* %x.addr to i8*
%1 = call i32 @cudaSetupArgument(i8* %0, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64), i64 0)
%2 = icmp eq i32 %1, 0
br i1 %2, label %setup.next, label %setup.end
setup.next: ; preds = %entry
%3 = bitcast i32** %g_array.addr to i8*
%4 = call i32 @cudaSetupArgument(i8* %3, i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64), i64 ptrtoint (i32** getelementptr ({ i32, i32* }* null, i64 0, i32 1) to i64))
%5 = icmp eq i32 %4, 0
br i1 %5, label %setup.next1, label %setup.end
setup.next1: ; preds = %setup.next
%6 = call i32 @cudaLaunch(i8* bitcast (void (i32, i32*)* @_Z2g1iPi to i8*))
br label %setup.end
setup.end: ; preds = %setup.next1, %setup.next, %entry
ret void
}
declare i32 @cudaSetupArgument(i8*, i64, i64)
declare i32 @cudaLaunch(i8*)
define i32 @main() uwtable {
entry:
%retval = alloca i32, align 4
%g_array = alloca i32*, align 8
%err = alloca i32, align 4
%err3 = alloca i32, align 4
%dev = alloca i32, align 4
%err10 = alloca i32, align 4
%agg.tmp = alloca %struct.dim3, align 4
%agg.tmp18 = alloca %struct.dim3, align 4
%err20 = alloca i32, align 4
%result = alloca i32, align 4
%err27 = alloca i32, align 4
store i32 0, i32* %retval
store i32* null, i32** %g_array, align 8
%call = call i32 @_Z10cudaMallocIiE9cudaErrorPPT_m(i32** %g_array, i64 4)
store i32 %call, i32* %err, align 4
%0 = load i32* %err, align 4
%cmp = icmp ne i32 %0, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%1 = load %struct._IO_FILE** @stderr, align 8
%2 = load i32* %err, align 4
%call1 = call i8* @cudaGetErrorString(i32 %2)
%call2 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 27, i8* %call1)
call void @exit(i32 1) noreturn nounwind
unreachable
if.end: ; preds = %entry
%3 = load i32** %g_array, align 8
%4 = bitcast i32* %3 to i8*
%call4 = call i32 @cudaMemset(i8* %4, i32 0, i64 4)
store i32 %call4, i32* %err3, align 4
%5 = load i32* %err3, align 4
%cmp5 = icmp ne i32 %5, 0
br i1 %cmp5, label %if.then6, label %if.end9
if.then6: ; preds = %if.end
%6 = load %struct._IO_FILE** @stderr, align 8
%7 = load i32* %err3, align 4
%call7 = call i8* @cudaGetErrorString(i32 %7)
%call8 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 28, i8* %call7)
call void @exit(i32 1) noreturn nounwind
unreachable
if.end9: ; preds = %if.end
store i32 -1, i32* %dev, align 4
%call11 = call i32 @cudaGetDevice(i32* %dev)
store i32 %call11, i32* %err10, align 4
%8 = load i32* %err10, align 4
%cmp12 = icmp ne i32 %8, 0
br i1 %cmp12, label %if.then13, label %if.end16
if.then13: ; preds = %if.end9
%9 = load %struct._IO_FILE** @stderr, align 8
%10 = load i32* %err10, align 4
%call14 = call i8* @cudaGetErrorString(i32 %10)
%call15 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %9, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 31, i8* %call14)
call void @exit(i32 1) noreturn nounwind
unreachable
if.end16: ; preds = %if.end9
%11 = load i32* %dev, align 4
%call17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str2, i32 0, i32 0), i32 %11)
call void @_ZN4dim3C1Ejjj(%struct.dim3* %agg.tmp, i32 1, i32 1, i32 1)
call void @_ZN4dim3C1Ejjj(%struct.dim3* %agg.tmp18, i32 1, i32 1, i32 1)
%12 = bitcast %struct.dim3* %agg.tmp to { i64, i32 }*
%13 = getelementptr { i64, i32 }* %12, i32 0, i32 0
%14 = load i64* %13, align 1
%15 = getelementptr { i64, i32 }* %12, i32 0, i32 1
%16 = load i32* %15, align 1
%17 = bitcast %struct.dim3* %agg.tmp18 to { i64, i32 }*
%18 = getelementptr { i64, i32 }* %17, i32 0, i32 0
%19 = load i64* %18, align 1
%20 = getelementptr { i64, i32 }* %17, i32 0, i32 1
%21 = load i32* %20, align 1
%call19 = call i32 @cudaConfigureCall(i64 %14, i32 %16, i64 %19, i32 %21, i64 0, %struct.CUstream_st* null)
%tobool = icmp ne i32 %call19, 0
br i1 %tobool, label %kcall.end, label %kcall.configok
kcall.configok: ; preds = %if.end16
%22 = load i32** %g_array, align 8
call void @_Z2g1iPi(i32 42, i32* %22)
br label %kcall.end
kcall.end: ; preds = %kcall.configok, %if.end16
%call21 = call i32 @cudaDeviceSynchronize()
store i32 %call21, i32* %err20, align 4
%23 = load i32* %err20, align 4
%cmp22 = icmp ne i32 %23, 0
br i1 %cmp22, label %if.then23, label %if.end26
if.then23: ; preds = %kcall.end
%24 = load %struct._IO_FILE** @stderr, align 8
%25 = load i32* %err20, align 4
%call24 = call i8* @cudaGetErrorString(i32 %25)
%call25 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %24, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 35, i8* %call24)
call void @exit(i32 1) noreturn nounwind
unreachable
if.end26: ; preds = %kcall.end
store i32 0, i32* %result, align 4
%26 = bitcast i32* %result to i8*
%27 = load i32** %g_array, align 8
%28 = bitcast i32* %27 to i8*
%call28 = call i32 @cudaMemcpy(i8* %26, i8* %28, i64 4, i32 2)
store i32 %call28, i32* %err27, align 4
%29 = load i32* %err27, align 4
%cmp29 = icmp ne i32 %29, 0
br i1 %cmp29, label %if.then30, label %if.end33
if.then30: ; preds = %if.end26
%30 = load %struct._IO_FILE** @stderr, align 8
%31 = load i32* %err27, align 4
%call31 = call i8* @cudaGetErrorString(i32 %31)
%call32 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %30, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 38, i8* %call31)
call void @exit(i32 1) noreturn nounwind
unreachable
if.end33: ; preds = %if.end26
%32 = load i32* %result, align 4
%call34 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str3, i32 0, i32 0), i32 %32)
%33 = load i32* %retval
ret i32 %33
}
define linkonce_odr i32 @_Z10cudaMallocIiE9cudaErrorPPT_m(i32** %devPtr, i64 %size) uwtable inlinehint {
entry:
%devPtr.addr = alloca i32**, align 8
%size.addr = alloca i64, align 8
store i32** %devPtr, i32*** %devPtr.addr, align 8
store i64 %size, i64* %size.addr, align 8
%0 = load i32*** %devPtr.addr, align 8
%1 = bitcast i32** %0 to i8*
%2 = bitcast i8* %1 to i8**
%3 = load i64* %size.addr, align 8
%call = call i32 @cudaMalloc(i8** %2, i64 %3)
ret i32 %call
}
declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
declare i8* @cudaGetErrorString(i32)
declare void @exit(i32) noreturn nounwind
declare i32 @cudaMemset(i8*, i32, i64)
declare i32 @cudaGetDevice(i32*)
declare i32 @printf(i8*, ...)
declare i32 @cudaConfigureCall(i64, i32, i64, i32, i64, %struct.CUstream_st*)
define linkonce_odr void @_ZN4dim3C1Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr uwtable align 2 {
entry:
%this.addr = alloca %struct.dim3*, align 8
%vx.addr = alloca i32, align 4
%vy.addr = alloca i32, align 4
%vz.addr = alloca i32, align 4
store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
store i32 %vx, i32* %vx.addr, align 4
store i32 %vy, i32* %vy.addr, align 4
store i32 %vz, i32* %vz.addr, align 4
%this1 = load %struct.dim3** %this.addr
%0 = load i32* %vx.addr, align 4
%1 = load i32* %vy.addr, align 4
%2 = load i32* %vz.addr, align 4
call void @_ZN4dim3C2Ejjj(%struct.dim3* %this1, i32 %0, i32 %1, i32 %2)
ret void
}
declare i32 @cudaDeviceSynchronize()
declare i32 @cudaMemcpy(i8*, i8*, i64, i32)
declare i32 @cudaMalloc(i8**, i64)
define linkonce_odr void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr nounwind uwtable align 2 {
entry:
%this.addr = alloca %struct.dim3*, align 8
%vx.addr = alloca i32, align 4
%vy.addr = alloca i32, align 4
%vz.addr = alloca i32, align 4
store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
store i32 %vx, i32* %vx.addr, align 4
store i32 %vy, i32* %vy.addr, align 4
store i32 %vz, i32* %vz.addr, align 4
%this1 = load %struct.dim3** %this.addr
%x = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 0
%0 = load i32* %vx.addr, align 4
store i32 %0, i32* %x, align 4
%y = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 1
%1 = load i32* %vy.addr, align 4
store i32 %1, i32* %y, align 4
%z = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 2
%2 = load i32* %vz.addr, align 4
store i32 %2, i32* %z, align 4
ret void
}
More information about the cfe-dev
mailing list