[llvm] r326806 - [AMDGPU] Fix lowering OpenCL enqueue_kernel
Yaxun Liu via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 6 08:04:39 PST 2018
Author: yaxunl
Date: Tue Mar 6 08:04:39 2018
New Revision: 326806
URL: http://llvm.org/viewvc/llvm-project?rev=326806&view=rev
Log:
[AMDGPU] Fix lowering OpenCL enqueue_kernel
One addrspacecast disappeared in clang emitted IR for
block invoke function due to adoption of the new
addr space mapping.
Differential Revision: https://reviews.llvm.org/D43785
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp?rev=326806&r1=326805&r2=326806&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp Tue Mar 6 08:04:39 2018
@@ -94,35 +94,33 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::
bool Changed = false;
for (auto &F : M.functions()) {
if (F.hasFnAttribute("enqueued-block")) {
- if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
- !isa<ConstantExpr>(*F.user_begin()) ||
- !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
- continue;
- }
- auto *BitCast = cast<ConstantExpr>(*F.user_begin());
- auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
- auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
- auto *GV = new GlobalVariable(
- M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
- /*IsConstant=*/true, GlobalValue::ExternalLinkage,
- /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
- GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
- /*IsExternallyInitialized=*/true);
- DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
- auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
- AddrCast->replaceAllUsesWith(NewPtr);
- F.addFnAttr("runtime-handle", RuntimeHandle);
- F.setLinkage(GlobalValue::ExternalLinkage);
+ for (auto U : F.users()) {
+ if (!isa<ConstantExpr>(&*U))
+ continue;
+ auto *BitCast = cast<ConstantExpr>(&*U);
+ auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+ auto *GV = new GlobalVariable(
+ M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
+ /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+ /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
+ /*IsExternallyInitialized=*/true);
+ DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+ auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
+ BitCast->replaceAllUsesWith(NewPtr);
+ F.addFnAttr("runtime-handle", RuntimeHandle);
+ F.setLinkage(GlobalValue::ExternalLinkage);
- // Collect direct or indirect callers of enqueue_kernel.
- for (auto U : NewPtr->users()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- auto *F = I->getParent()->getParent();
- Callers.insert(F);
- collectCallers(F, Callers);
+ // Collect direct or indirect callers of enqueue_kernel.
+ for (auto U : NewPtr->users()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ auto *F = I->getParent()->getParent();
+ Callers.insert(F);
+ collectCallers(F, Callers);
+ }
}
+ Changed = true;
}
- Changed = true;
}
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll?rev=326806&r1=326805&r2=326806&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll Tue Mar 6 08:04:39 2018
@@ -1,11 +1,8 @@
-; RUN: opt -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
+; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
-target triple = "amdgcn-amdhsa-amd-opencl"
-
%struct.ndrange_t = type { i32 }
%opencl.queue_t = type opaque
@@ -26,66 +23,64 @@ define amdgpu_kernel void @caller_indire
define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
- %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
- %tmp = alloca %struct.ndrange_t, align 4
- %block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8
- %tmp3 = alloca %struct.ndrange_t, align 4
- %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 0
- store i32 25, i32* %block.size, align 8
- %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1
- store i32 8, i32* %block.align, align 4
- %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2
- store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8
- %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3
- store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8
- %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4
- store i8 %b, i8* %block.captured1, align 8
- %tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()*
- %tmp2 = bitcast void ()* %tmp1 to i8*
- %tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)*
- %tmp5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2
- %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0
- store i32 41, i32* %block.size4, align 8
- %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1
- store i32 8, i32* %block.align5, align 4
- %block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2
- store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8
- %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3
- store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8
- %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6
- store i8 %b, i8* %block.captured8, align 8
- %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4
- store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8
- %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5
- store i64 %d, i64* %block.captured10, align 8
- %tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()*
- %tmp7 = bitcast void ()* %tmp6 to i8*
- %tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)*
- %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2
+ %block = alloca <{ i32, i32, i8 addrspace(1)*, i8 }>, align 8, addrspace(5)
+ %tmp = alloca %struct.ndrange_t, align 4, addrspace(5)
+ %block2 = alloca <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8, addrspace(5)
+ %tmp3 = alloca %struct.ndrange_t, align 4, addrspace(5)
+ %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(1)*, i8 }> addrspace(5)* %block, i32 0, i32 0
+ store i32 25, i32 addrspace(5)* %block.size, align 8
+ %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(1)*, i8 }> addrspace(5)* %block, i32 0, i32 1
+ store i32 8, i32 addrspace(5)* %block.align, align 4
+ %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(1)*, i8 }> addrspace(5)* %block, i32 0, i32 2
+ store i8 addrspace(1)* %a, i8 addrspace(1)* addrspace(5)* %block.captured, align 8
+ %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(1)*, i8 }> addrspace(5)* %block, i32 0, i32 3
+ store i8 %b, i8 addrspace(5)* %block.captured1, align 8
+ %tmp1 = bitcast <{ i32, i32, i8 addrspace(1)*, i8 }> addrspace(5)* %block to void () addrspace(5)*
+ %tmp4 = addrspacecast void () addrspace(5)* %tmp1 to i8*
+ %tmp5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp,
+ i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*), i8* nonnull %tmp4) #2
+ %tmp10 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp,
+ i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*), i8* nonnull %tmp4) #2
+ %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 0
+ store i32 41, i32 addrspace(5)* %block.size4, align 8
+ %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 1
+ store i32 8, i32 addrspace(5)* %block.align5, align 4
+ %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 2
+ store i8 addrspace(1)* %a, i8 addrspace(1)* addrspace(5)* %block.captured7, align 8
+ %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 5
+ store i8 %b, i8 addrspace(5)* %block.captured8, align 8
+ %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 3
+ store i64 addrspace(1)* %c, i64 addrspace(1)* addrspace(5)* %block.captured9, align 8
+ %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2, i32 0, i32 4
+ store i64 %d, i64 addrspace(5)* %block.captured10, align 8
+ %tmp6 = bitcast <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> addrspace(5)* %block2 to void () addrspace(5)*
+ %tmp8 = addrspacecast void () addrspace(5)* %tmp6 to i8*
+ %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t addrspace(5)* byval nonnull %tmp3,
+ i8* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*), i8* nonnull %tmp8) #2
ret void
}
; CHECK: define dso_local amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]]
-define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0
+define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0
!kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
entry:
- %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3
- %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4
+ %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i8 }> %arg, 2
+ %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i8 }> %arg, 3
store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1
ret void
}
-declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr
+declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t addrspace(5)*, i8*, i8*) local_unnamed_addr
; CHECK: define dso_local amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]]
-define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*,
+define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(1)*,
i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15
!kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
entry:
- %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3
- %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4
- %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5
- %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6
+ %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 2
+ %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3
+ %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4
+ %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5
store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1
store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8
ret void
More information about the llvm-commits
mailing list