[llvm] 05289df - [AMDGPU] Handle constant LDS uses from different kernels

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 7 15:44:01 PDT 2021


Author: Stanislav Mekhanoshin
Date: 2021-06-07T15:39:08-07:00
New Revision: 05289dfb62461b73654c5fd64b1ef044a88ee027

URL: https://github.com/llvm/llvm-project/commit/05289dfb62461b73654c5fd64b1ef044a88ee027
DIFF: https://github.com/llvm/llvm-project/commit/05289dfb62461b73654c5fd64b1ef044a88ee027.diff

LOG: [AMDGPU] Handle constant LDS uses from different kernels

This allows lowering an LDS variable into a kernel structure
even if a constant expression referencing it is used from
different kernels.

Differential Revision: https://reviews.llvm.org/D103655
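
For illustration, a minimal sketch of the input this change handles (a
hypothetical module in the style of the lower-kernel-lds-constexpr.ll test
below; the variable and kernel names are invented): the same constant
expression over an LDS variable is used from two different kernels. The pass
now expands that constant expression into instructions inside each kernel
before rewriting the kernel's uses to point at its per-kernel LDS struct.

@lds.example = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1

define amdgpu_kernel void @kern_a(i64 %x) {
  ; Constant-expression use: addrspacecast of a GEP into @lds.example.
  %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.example, i32 0, i32 0) to i8*), i64 %x
  store i8 1, i8* %ptr, align 1
  ret void
}

define amdgpu_kernel void @kern_b(i64 %x) {
  ; The same constant expression, used from a second kernel.
  %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.example, i32 0, i32 0) to i8*), i64 %x
  store i8 2, i8* %ptr, align 1
  ret void
}

Running opt -mtriple=amdgcn-- -amdgpu-lower-module-lds on such a module (as in
the RUN lines of the tests below) should now give each kernel its own
@llvm.amdgcn.kernel.<name>.lds structure, with the former constant expression
rewritten as getelementptr/addrspacecast instructions, rather than leaving the
variable unlowered as before.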

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 8e3ce775b5819..e3287f07aa2d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -292,8 +292,18 @@ class AMDGPULowerModuleLDS : public ModulePass {
       Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
       Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
       if (F) {
+        // Replace all constant uses with instructions if they belong to the
+        // current kernel.
+        for (User *U : make_early_inc_range(GV->users())) {
+          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+            AMDGPU::replaceConstantUsesInFunction(C, F);
+        }
+
+        GV->removeDeadConstantUsers();
+
         GV->replaceUsesWithIf(GEP, [F](Use &U) {
-          return AMDGPU::isUsedOnlyFromFunction(U.getUser(), F);
+          Instruction *I = dyn_cast<Instruction>(U.getUser());
+          return I && I->getFunction() == F;
         });
       } else {
         GV->replaceAllUsesWith(GEP);

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
index 8bccf0e5d03e7..ba3e0defbdc1d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -12,7 +12,9 @@
 
 #include "AMDGPULDSUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/ReplaceConstant.h"
 
 using namespace llvm;
 
@@ -29,17 +31,33 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
                                        GV->getValueType());
 }
 
-bool isUsedOnlyFromFunction(const User *U, const Function *F) {
-  if (auto *I = dyn_cast<Instruction>(U)) {
-    return I->getFunction() == F;
-  }
+static void collectFunctionUses(User *U, const Function *F,
+                                SetVector<Instruction *> &InstUsers) {
+  SmallVector<User *> Stack{U};
+
+  while (!Stack.empty()) {
+    U = Stack.pop_back_val();
+
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      if (I->getFunction() == F)
+        InstUsers.insert(I);
+      continue;
+    }
 
-  if (isa<ConstantExpr>(U)) {
-    return all_of(U->users(),
-                  [F](const User *U) { return isUsedOnlyFromFunction(U, F); });
+    if (!isa<ConstantExpr>(U))
+      continue;
+
+    append_range(Stack, U->users());
   }
+}
 
-  return false;
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
+  SetVector<Instruction *> InstUsers;
+
+  collectFunctionUses(C, F, InstUsers);
+  for (Instruction *I : InstUsers) {
+    convertConstantExprsToInstructions(I, C);
+  }
 }
 
 bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
@@ -76,14 +94,6 @@ bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
     }
 
     if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      if (F) {
-        // Any use which does not end up an instruction disqualifies a
-        // variable to be put into a kernel's LDS structure because later
-        // we will need to replace only this kernel's uses for which we
-        // need to identify a using function.
-        if (!isUsedOnlyFromFunction(E, F))
-          return false;
-      }
       for (const User *U : E->users()) {
         if (Visited.insert(U).second) {
           Stack.push_back(U);

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
index adcaa34112524..b5e2cb9f3bf79 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -17,6 +17,8 @@
 
 namespace llvm {
 
+class ConstantExpr;
+
 namespace AMDGPU {
 
 bool isKernelCC(const Function *Func);
@@ -39,9 +41,8 @@ findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
-/// \returns true if all uses of \p U end up in a function \p F.
-bool isUsedOnlyFromFunction(const User *U, const Function *F);
-
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
 } // end namespace AMDGPU
 
 } // end namespace llvm

diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 21c9d2ec3046e..e1270ba4f72ec 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -947,7 +947,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; CI-NEXT:    ds_read_b64 v[0:1], v0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
@@ -959,7 +959,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
 ; GFX9-LABEL: load_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v2
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1

diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index a57b8a62e194e..71dc7f2ff44fd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -774,21 +774,19 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ; CI-LABEL: store_constant_adjacent_offsets:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_movk_i32 s0, 0x7b
-; CI-NEXT:    v_mov_b32_e32 v0, 0
-; CI-NEXT:    v_mov_b32_e32 v1, s0
-; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v1, v0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; CI-NEXT:    ds_write_b64 v2, v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: store_constant_adjacent_offsets:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s0, 0x7b
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_endpgm
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4

diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
index 7ca6eaad61b26..99a7c2127a19c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -1,19 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --check-globals
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
-; CHECK-NOT: %llvm.amdgcn.kernel.k4.lds.t
-
 @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
 
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i8] }
+
 ; Use constant from different kernels
 ;.
-; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 2
+; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 2
 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
+; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 2
 ;.
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -24,7 +33,9 @@ define amdgpu_kernel void @k0(i64 %x) {
 
 define amdgpu_kernel void @k1(i64 %x) {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -56,10 +67,15 @@ define amdgpu_kernel void @k2(i64 %x) {
 ; Use constant twice from the same kernel but a different other constant.
 define amdgpu_kernel void @k3(i64 %x) {
 ; CHECK-LABEL: @k3(
-; CHECK-NEXT:    %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 16) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT:    %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 16
+; CHECK-NEXT:    %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
+; CHECK-NEXT:    %ptr1 = addrspacecast i64 addrspace(3)* %2 to i64*
 ; CHECK-NEXT:    store i64 1, i64* %ptr1, align 1
-; CHECK-NEXT:    %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 24) to i64 addrspace(3)*) to i64*
+; CHECK-NEXT:    %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
+; CHECK-NEXT:    %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
+; CHECK-NEXT:    %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
 ; CHECK-NEXT:    store i64 2, i64* %ptr2, align 1
+; CHECK-NEXT:    ret void
 ;
   %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
   store i64 1, i64* %ptr1, align 1
@@ -69,10 +85,11 @@ define amdgpu_kernel void @k3(i64 %x) {
 }
 
 ; @lds.1 is used from constant expressions in different kernels.
-; Make sure we do not create a structure for it as we cannot handle it yet.
 define amdgpu_kernel void @k4(i64 %x) {
 ; CHECK-LABEL: @k4(
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x
+; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 0), i32 0, i32 0
+; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
 ; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
index 43ba68aa01ebd..948c07dcc22ca 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -37,9 +37,19 @@ entry:
 
 ; CHECK-LABEL: @timestwo()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: %3 = ptrtoint i32* %2 to i64
+; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3
+; CHECK: %5 = inttoptr i64 %4 to i32*
+; CHECK: %ld = load i32, i32* %5, align 4
 ; CHECK: %mul = mul i32 %ld, 2
-; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
+; CHECK: %8 = ptrtoint i32* %7 to i64
+; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)
+; CHECK: %10 = inttoptr i64 %9 to i32*
+; CHECK: store i32 %mul, i32* %10, align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2


        

