[llvm] r329815 - [AMDGPU] Fix lowering enqueue_kernel

Yaxun Liu via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 11 07:46:16 PDT 2018


Author: yaxunl
Date: Wed Apr 11 07:46:15 2018
New Revision: 329815

URL: http://llvm.org/viewvc/llvm-project?rev=329815&view=rev
Log:
[AMDGPU] Fix lowering enqueue_kernel

Two issues were fixed:

The runtime has difficulty allocating memory for an external symbol of a
kernel and setting the address of that external symbol; therefore, make the
runtime handle of an enqueued kernel an ordinary global variable. The runtime
only needs to store the address of the loaded kernel in the handle, and this
approach has been verified to work.

Handle the situation where __enqueue_kernel* gets inlined, in which case
the enqueued kernel may be used through a constant expression instead
of an instruction.

Differential Revision: https://reviews.llvm.org/D45187

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp?rev=329815&r1=329814&r2=329815&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp Wed Apr 11 07:46:15 2018
@@ -81,14 +81,27 @@ static void collectCallers(Function *F,
   for (auto U : F->users()) {
     if (auto *CI = dyn_cast<CallInst>(&*U)) {
       auto *Caller = CI->getParent()->getParent();
-      if (Callers.count(Caller))
-        continue;
-      Callers.insert(Caller);
-      collectCallers(Caller, Callers);
+      if (Callers.insert(Caller).second)
+        collectCallers(Caller, Callers);
     }
   }
 }
 
+/// If \p U is instruction or constant, collect functions which directly or
+/// indirectly use it.
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
+  if (auto *I = dyn_cast<Instruction>(U)) {
+    auto *F = I->getParent()->getParent();
+    if (Funcs.insert(F).second)
+      collectCallers(F, Funcs);
+    return;
+  }
+  if (!isa<Constant>(U))
+    return;
+  for (auto UU : U->users())
+    collectFunctionUsers(&*UU, Funcs);
+}
+
 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
   DenseSet<Function *> Callers;
   auto &C = M.getContext();
@@ -101,32 +114,28 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::
                                    M.getDataLayout());
         F.setName(Name);
       }
+      DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
       auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
+      auto T = Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS);
       auto *GV = new GlobalVariable(
-          M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
-          /*IsConstant=*/true, GlobalValue::ExternalLinkage,
-          /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
-          GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
-          /*IsExternallyInitialized=*/true);
+          M, T,
+          /*IsConstant=*/false, GlobalValue::ExternalLinkage,
+          /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
+          /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+          AMDGPUAS::GLOBAL_ADDRESS,
+          /*IsExternallyInitialized=*/false);
       DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
 
       for (auto U : F.users()) {
-        if (!isa<ConstantExpr>(&*U))
+        auto *UU = &*U;
+        if (!isa<ConstantExpr>(UU))
           continue;
-        auto *BitCast = cast<ConstantExpr>(&*U);
+        collectFunctionUsers(UU, Callers);
+        auto *BitCast = cast<ConstantExpr>(UU);
         auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
         BitCast->replaceAllUsesWith(NewPtr);
         F.addFnAttr("runtime-handle", RuntimeHandle);
         F.setLinkage(GlobalValue::ExternalLinkage);
-
-        // Collect direct or indirect callers of enqueue_kernel.
-        for (auto U : NewPtr->users()) {
-          if (auto *I = dyn_cast<Instruction>(&*U)) {
-            auto *F = I->getParent()->getParent();
-            Callers.insert(F);
-            collectCallers(F, Callers);
-          }
-        }
         Changed = true;
       }
     }
@@ -136,6 +145,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::
     if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
       continue;
     F->addFnAttr("calls-enqueue-kernel");
+    DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
   }
   return Changed;
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll?rev=329815&r1=329814&r2=329815&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/enqueue-kernel.ll Wed Apr 11 07:46:15 2018
@@ -1,9 +1,9 @@
 ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
 
-; CHECK: @__test_block_invoke_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
-; CHECK: @__test_block_invoke_2_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
-; CHECK: @__amdgpu_enqueued_kernel.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
-; CHECK: @__amdgpu_enqueued_kernel.1.runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+; CHECK: @__test_block_invoke_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null
+; CHECK: @__test_block_invoke_2_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null
+; CHECK: @__amdgpu_enqueued_kernel.runtime_handle = addrspace(1) global i8 addrspace(1)* null
+; CHECK: @__amdgpu_enqueued_kernel.1.runtime_handle = addrspace(1) global i8 addrspace(1)* null
 
 %struct.ndrange_t = type { i32 }
 %opencl.queue_t = type opaque
@@ -80,6 +80,19 @@ entry:
   ret void
 }
 
+; __enqueue_kernel* functions may get inlined
+; CHECK-LABEL: define amdgpu_kernel void @inlined_caller
+; CHECK-SAME: #[[AT_CALLER]]
+; CHECK-NOT: @__test_block_invoke_kernel
+; CHECK: load i64, i64 addrspace(1)* bitcast (i8 addrspace(1)* addrspace(1)* @__test_block_invoke_kernel.runtime_handle to i64 addrspace(1)*)
+define amdgpu_kernel void @inlined_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+  !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+  %tmp = load i64, i64 addrspace(1)* addrspacecast (i64* bitcast (void (<{ i32, i32, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i64*) to i64 addrspace(1)*)
+  store i64 %tmp, i64 addrspace(1)* %c
+  ret void
+}
+
 ; CHECK-LABEL: define dso_local amdgpu_kernel void @__test_block_invoke_kernel
 ; CHECK-SAME: #[[AT1:[0-9]+]]
 define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(1)*, i8 }> %arg) #0




More information about the llvm-commits mailing list