[cfe-dev] Clang and CUDA with C++11 features

Thu Jun 14 11:08:26 PDT 2012

On Thu, Jun 14, 2012 at 09:57:27AM -0400, Justin Holewinski wrote:
> Something is probably getting messed up in the IR generation for the kernel
> functions (we recently got rid of the old PTX back-end, on which the CUDA
> integration is based, in favor of the NVPTX back-end).  If you post the IR,
> I can take a look at it.

I attached the CUDA source and the LLVM IR.

clang++ -S -emit-llvm -I/usr/local/cuda-4.2/cuda/include -o kernel-call.s kernel-call.cu

clang++ -I/usr/local/cuda-4.2/cuda/include -L/usr/local/cuda-4.2/cuda/lib64 -lcudart -o kernel-call kernel-call.cu

./kernel-call 
Using CUDA device #0
42 == 0

clang++ --version
clang version 3.2 (http://llvm.org/git/clang.git f1492f970c7c6eb85dc18f13fb864b185bed1d23) (http://llvm.org/git/llvm.git cba91230c0beef79e5042d8e983198b26aac5616)
Target: x86_64-unknown-linux-gnu
Thread model: posix

Thanks,
Peter
-------------- next part --------------
A non-text attachment was scrubbed...
Name: kernel-call.cu
Type: application/cu-seeme
Size: 1236 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20120614/cf3f414a/attachment.cu>
-------------- next part --------------
; ModuleID = 'kernel-call.cu'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
%struct.dim3 = type { i32, i32, i32 }
%struct.CUstream_st = type opaque

@stderr = external global %struct._IO_FILE*
@.str = private unnamed_addr constant [26 x i8] c"%s (%d): error: CUDA: %s\0A\00", align 1
@.str1 = private unnamed_addr constant [15 x i8] c"kernel-call.cu\00", align 1
@.str2 = private unnamed_addr constant [23 x i8] c"Using CUDA device #%d\0A\00", align 1
@.str3 = private unnamed_addr constant [10 x i8] c"42 == %d\0A\00", align 1

define void @_Z2g1iPi(i32 %x, i32* %g_array) uwtable {
entry:
  %x.addr = alloca i32, align 4
  %g_array.addr = alloca i32*, align 8
  store i32 %x, i32* %x.addr, align 4
  store i32* %g_array, i32** %g_array.addr, align 8
  %0 = bitcast i32* %x.addr to i8*
  %1 = call i32 @cudaSetupArgument(i8* %0, i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64), i64 0)
  %2 = icmp eq i32 %1, 0
  br i1 %2, label %setup.next, label %setup.end

setup.next:                                       ; preds = %entry
  %3 = bitcast i32** %g_array.addr to i8*
  %4 = call i32 @cudaSetupArgument(i8* %3, i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64), i64 ptrtoint (i32** getelementptr ({ i32, i32* }* null, i64 0, i32 1) to i64))
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %setup.next1, label %setup.end

setup.next1:                                      ; preds = %setup.next
  %6 = call i32 @cudaLaunch(i8* bitcast (void (i32, i32*)* @_Z2g1iPi to i8*))
  br label %setup.end

setup.end:                                        ; preds = %setup.next1, %setup.next, %entry
  ret void
}

declare i32 @cudaSetupArgument(i8*, i64, i64)

declare i32 @cudaLaunch(i8*)

define i32 @main() uwtable {
entry:
  %retval = alloca i32, align 4
  %g_array = alloca i32*, align 8
  %err = alloca i32, align 4
  %err3 = alloca i32, align 4
  %dev = alloca i32, align 4
  %err10 = alloca i32, align 4
  %agg.tmp = alloca %struct.dim3, align 4
  %agg.tmp18 = alloca %struct.dim3, align 4
  %err20 = alloca i32, align 4
  %result = alloca i32, align 4
  %err27 = alloca i32, align 4
  store i32 0, i32* %retval
  store i32* null, i32** %g_array, align 8
  %call = call i32 @_Z10cudaMallocIiE9cudaErrorPPT_m(i32** %g_array, i64 4)
  store i32 %call, i32* %err, align 4
  %0 = load i32* %err, align 4
  %cmp = icmp ne i32 %0, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load %struct._IO_FILE** @stderr, align 8
  %2 = load i32* %err, align 4
  %call1 = call i8* @cudaGetErrorString(i32 %2)
  %call2 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 27, i8* %call1)
  call void @exit(i32 1) noreturn nounwind
  unreachable

if.end:                                           ; preds = %entry
  %3 = load i32** %g_array, align 8
  %4 = bitcast i32* %3 to i8*
  %call4 = call i32 @cudaMemset(i8* %4, i32 0, i64 4)
  store i32 %call4, i32* %err3, align 4
  %5 = load i32* %err3, align 4
  %cmp5 = icmp ne i32 %5, 0
  br i1 %cmp5, label %if.then6, label %if.end9

if.then6:                                         ; preds = %if.end
  %6 = load %struct._IO_FILE** @stderr, align 8
  %7 = load i32* %err3, align 4
  %call7 = call i8* @cudaGetErrorString(i32 %7)
  %call8 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %6, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 28, i8* %call7)
  call void @exit(i32 1) noreturn nounwind
  unreachable

if.end9:                                          ; preds = %if.end
  store i32 -1, i32* %dev, align 4
  %call11 = call i32 @cudaGetDevice(i32* %dev)
  store i32 %call11, i32* %err10, align 4
  %8 = load i32* %err10, align 4
  %cmp12 = icmp ne i32 %8, 0
  br i1 %cmp12, label %if.then13, label %if.end16

if.then13:                                        ; preds = %if.end9
  %9 = load %struct._IO_FILE** @stderr, align 8
  %10 = load i32* %err10, align 4
  %call14 = call i8* @cudaGetErrorString(i32 %10)
  %call15 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %9, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 31, i8* %call14)
  call void @exit(i32 1) noreturn nounwind
  unreachable

if.end16:                                         ; preds = %if.end9
  %11 = load i32* %dev, align 4
  %call17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8]* @.str2, i32 0, i32 0), i32 %11)
  call void @_ZN4dim3C1Ejjj(%struct.dim3* %agg.tmp, i32 1, i32 1, i32 1)
  call void @_ZN4dim3C1Ejjj(%struct.dim3* %agg.tmp18, i32 1, i32 1, i32 1)
  %12 = bitcast %struct.dim3* %agg.tmp to { i64, i32 }*
  %13 = getelementptr { i64, i32 }* %12, i32 0, i32 0
  %14 = load i64* %13, align 1
  %15 = getelementptr { i64, i32 }* %12, i32 0, i32 1
  %16 = load i32* %15, align 1
  %17 = bitcast %struct.dim3* %agg.tmp18 to { i64, i32 }*
  %18 = getelementptr { i64, i32 }* %17, i32 0, i32 0
  %19 = load i64* %18, align 1
  %20 = getelementptr { i64, i32 }* %17, i32 0, i32 1
  %21 = load i32* %20, align 1
  %call19 = call i32 @cudaConfigureCall(i64 %14, i32 %16, i64 %19, i32 %21, i64 0, %struct.CUstream_st* null)
  %tobool = icmp ne i32 %call19, 0
  br i1 %tobool, label %kcall.end, label %kcall.configok

kcall.configok:                                   ; preds = %if.end16
  %22 = load i32** %g_array, align 8
  call void @_Z2g1iPi(i32 42, i32* %22)
  br label %kcall.end

kcall.end:                                        ; preds = %kcall.configok, %if.end16
  %call21 = call i32 @cudaDeviceSynchronize()
  store i32 %call21, i32* %err20, align 4
  %23 = load i32* %err20, align 4
  %cmp22 = icmp ne i32 %23, 0
  br i1 %cmp22, label %if.then23, label %if.end26

if.then23:                                        ; preds = %kcall.end
  %24 = load %struct._IO_FILE** @stderr, align 8
  %25 = load i32* %err20, align 4
  %call24 = call i8* @cudaGetErrorString(i32 %25)
  %call25 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %24, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 35, i8* %call24)
  call void @exit(i32 1) noreturn nounwind
  unreachable

if.end26:                                         ; preds = %kcall.end
  store i32 0, i32* %result, align 4
  %26 = bitcast i32* %result to i8*
  %27 = load i32** %g_array, align 8
  %28 = bitcast i32* %27 to i8*
  %call28 = call i32 @cudaMemcpy(i8* %26, i8* %28, i64 4, i32 2)
  store i32 %call28, i32* %err27, align 4
  %29 = load i32* %err27, align 4
  %cmp29 = icmp ne i32 %29, 0
  br i1 %cmp29, label %if.then30, label %if.end33

if.then30:                                        ; preds = %if.end26
  %30 = load %struct._IO_FILE** @stderr, align 8
  %31 = load i32* %err27, align 4
  %call31 = call i8* @cudaGetErrorString(i32 %31)
  %call32 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %30, i8* getelementptr inbounds ([26 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 38, i8* %call31)
  call void @exit(i32 1) noreturn nounwind
  unreachable

if.end33:                                         ; preds = %if.end26
  %32 = load i32* %result, align 4
  %call34 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str3, i32 0, i32 0), i32 %32)
  %33 = load i32* %retval
  ret i32 %33
}

define linkonce_odr i32 @_Z10cudaMallocIiE9cudaErrorPPT_m(i32** %devPtr, i64 %size) uwtable inlinehint {
entry:
  %devPtr.addr = alloca i32**, align 8
  %size.addr = alloca i64, align 8
  store i32** %devPtr, i32*** %devPtr.addr, align 8
  store i64 %size, i64* %size.addr, align 8
  %0 = load i32*** %devPtr.addr, align 8
  %1 = bitcast i32** %0 to i8*
  %2 = bitcast i8* %1 to i8**
  %3 = load i64* %size.addr, align 8
  %call = call i32 @cudaMalloc(i8** %2, i64 %3)
  ret i32 %call
}

declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)

declare i8* @cudaGetErrorString(i32)

declare void @exit(i32) noreturn nounwind

declare i32 @cudaMemset(i8*, i32, i64)

declare i32 @cudaGetDevice(i32*)

declare i32 @printf(i8*, ...)

declare i32 @cudaConfigureCall(i64, i32, i64, i32, i64, %struct.CUstream_st*)

define linkonce_odr void @_ZN4dim3C1Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr uwtable align 2 {
entry:
  %this.addr = alloca %struct.dim3*, align 8
  %vx.addr = alloca i32, align 4
  %vy.addr = alloca i32, align 4
  %vz.addr = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
  store i32 %vx, i32* %vx.addr, align 4
  store i32 %vy, i32* %vy.addr, align 4
  store i32 %vz, i32* %vz.addr, align 4
  %this1 = load %struct.dim3** %this.addr
  %0 = load i32* %vx.addr, align 4
  %1 = load i32* %vy.addr, align 4
  %2 = load i32* %vz.addr, align 4
  call void @_ZN4dim3C2Ejjj(%struct.dim3* %this1, i32 %0, i32 %1, i32 %2)
  ret void
}

declare i32 @cudaDeviceSynchronize()

declare i32 @cudaMemcpy(i8*, i8*, i64, i32)

declare i32 @cudaMalloc(i8**, i64)

define linkonce_odr void @_ZN4dim3C2Ejjj(%struct.dim3* %this, i32 %vx, i32 %vy, i32 %vz) unnamed_addr nounwind uwtable align 2 {
entry:
  %this.addr = alloca %struct.dim3*, align 8
  %vx.addr = alloca i32, align 4
  %vy.addr = alloca i32, align 4
  %vz.addr = alloca i32, align 4
  store %struct.dim3* %this, %struct.dim3** %this.addr, align 8
  store i32 %vx, i32* %vx.addr, align 4
  store i32 %vy, i32* %vy.addr, align 4
  store i32 %vz, i32* %vz.addr, align 4
  %this1 = load %struct.dim3** %this.addr
  %x = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 0
  %0 = load i32* %vx.addr, align 4
  store i32 %0, i32* %x, align 4
  %y = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 1
  %1 = load i32* %vy.addr, align 4
  store i32 %1, i32* %y, align 4
  %z = getelementptr inbounds %struct.dim3* %this1, i32 0, i32 2
  %2 = load i32* %vz.addr, align 4
  store i32 %2, i32* %z, align 4
  ret void
}