[llvm] f6632f1 - [AMDGPU] Fix missing lowering of LDS used in global scope.

via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 9 20:10:45 PDT 2021


Author: hsmahesha
Date: 2021-06-10T08:40:01+05:30
New Revision: f6632f11ed196bbb383357e9e7ecd1cf33e08a6a

URL: https://github.com/llvm/llvm-project/commit/f6632f11ed196bbb383357e9e7ecd1cf33e08a6a
DIFF: https://github.com/llvm/llvm-project/commit/f6632f11ed196bbb383357e9e7ecd1cf33e08a6a.diff

LOG: [AMDGPU] Fix missing lowering of LDS used in global scope.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D103431

Added: 
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index e3287f07aa2d8..937daea6bc25d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -171,7 +171,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
 
     // Find variables to move into new struct instance
     std::vector<GlobalVariable *> FoundLocalVars =
-        AMDGPU::findVariablesToLower(M, UsedList, F);
+        AMDGPU::findVariablesToLower(M, F);
 
     if (FoundLocalVars.empty()) {
       // No variables to rewrite, no changes made.

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
index ba3e0defbdc1d..fd704faab2ab5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -60,15 +60,34 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
   }
 }
 
-bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                            const GlobalVariable &GV, const Function *F) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
+bool hasUserInstruction(const GlobalValue *GV) {
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV->users());
+
+  while (!Stack.empty()) {
+    const User *U = Stack.pop_back_val();
+
+    if (!Visited.insert(U).second)
+      continue;
+
+    if (isa<Instruction>(U))
+      return true;
+
+    append_range(Stack, U->users());
+  }
+
+  return false;
+}
+
+bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
+  // We are not interested in kernel LDS lowering for module LDS itself.
+  if (F && GV.getName() == "llvm.amdgcn.module.lds")
+    return false;
+
   bool Ret = false;
   SmallPtrSet<const User *, 8> Visited;
   SmallVector<const User *, 16> Stack(GV.users());
+  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;
 
   assert(!F || isKernelCC(F));
 
@@ -76,10 +95,16 @@ bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
     const User *V = Stack.pop_back_val();
     Visited.insert(V);
 
-    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
-        continue;
+    if (auto *G = dyn_cast<GlobalValue>(V)) {
+      StringRef GName = G->getName();
+      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
+        // For kernel LDS lowering, if G is neither the llvm.used nor the
+        // llvm.compiler.used list, then we cannot lower the LDS GV, since we
+        // cannot replace the use of GV within G.
+        return false;
       }
+      GlobalUsers.insert(G);
+      continue;
     }
 
     if (auto *I = dyn_cast<Instruction>(V)) {
@@ -88,32 +113,32 @@ bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
         // Used from this kernel, we want to put it into the structure.
         Ret = true;
       } else if (!F) {
+        // For module LDS lowering, lowering is required if the user
+        // instruction is in a non-kernel function.
         Ret |= !isKernelCC(UF);
       }
       continue;
     }
 
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      for (const User *U : E->users()) {
-        if (Visited.insert(U).second) {
-          Stack.push_back(U);
-        }
-      }
-      continue;
-    }
+    // User V should be a constant; recursively visit the users of V.
+    assert(isa<Constant>(V) && "Expected a constant.");
+    append_range(Stack, V->users());
+  }
 
-    // Unknown user, conservatively lower the variable.
-    // For module LDS conservatively means place it into the module LDS struct.
-    // For kernel LDS it means lower as a standalone variable.
-    return !F;
+  if (!F && !Ret) {
+    // For module LDS lowering, we have not yet decided whether to lower GV.
+    // Explore all global users of GV, and check if at least one of these
+    // global users appears as a use within an instruction (possibly a nested
+    // use via a constant expression). If so, conservatively lower LDS.
+    for (auto *G : GlobalUsers)
+      Ret |= hasUserInstruction(G);
   }
 
   return Ret;
 }
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     const Function *F) {
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F) {
   std::vector<llvm::GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
     if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -137,7 +162,7 @@ findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
       // dropped by the back end if not. This pass skips over it.
       continue;
     }
-    if (!shouldLowerLDSToStruct(UsedList, GV, F)) {
+    if (!shouldLowerLDSToStruct(GV, F)) {
       continue;
     }
     LocalVars.push_back(&GV);
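
The heart of the change is the new hasUserInstruction() helper plus the GlobalUsers bookkeeping in shouldLowerLDSToStruct(): for module LDS lowering, a variable whose only uses are initializers of other globals is now lowered when at least one of those globals is itself reachable from an instruction. The following standalone sketch (illustrative only, not part of the patch; the in-tree helper is AMDGPU::hasUserInstruction) mirrors that user-graph walk with the usual LLVM ADT/IR headers:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Walk every user of GV, including users of nested constant expressions, and
// report whether any of them is an instruction.
static bool hasUserInstructionSketch(const GlobalValue *GV) {
  SmallPtrSet<const User *, 8> Visited;             // avoid revisiting users
  SmallVector<const User *, 16> Stack(GV->users()); // DFS worklist

  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();

    if (!Visited.insert(U).second)
      continue; // already reached via another constant-expression path

    if (isa<Instruction>(U))
      return true; // GV is (transitively) referenced from real code

    // Otherwise U is a constant (e.g. a ConstantExpr or another global's
    // initializer); keep walking its users.
    append_range(Stack, U->users());
  }
  return false;
}

Note that the rewritten shouldLowerLDSToStruct() follows the same pattern, but no longer special-cases ConstantExpr: any non-instruction, non-global user is asserted to be a constant and its users are pushed onto the worklist.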

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
index b5e2cb9f3bf79..95011ee3e6267 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -25,19 +25,19 @@ bool isKernelCC(const Function *Func);
 
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+/// \returns true if a given global variable \p GV (or any of its global users)
+/// appears as a use within some instruction (in either a kernel or a
+/// non-kernel function).
+bool hasUserInstruction(const GlobalValue *GV);
+
 /// \returns true if an LDS global requires lowering to a module LDS structure
 /// if \p F is not given. If \p F is given, it must be a kernel, and the function
 /// \returns true if an LDS global is directly used from that kernel and it
 /// is safe to replace its uses with a kernel LDS structure member.
-/// \p UsedList contains a union of llvm.used and llvm.compiler.used variables
-/// which do not count as a use.
-bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                            const GlobalVariable &GV,
+bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                             const Function *F = nullptr);
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     const Function *F = nullptr);
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F = nullptr);
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
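
With the UsedList parameter dropped from both entry points, callers now pass only the module and, for per-kernel lowering, the kernel itself. Below is a minimal sketch of how the two modes might be driven; the wrapper names collectModuleLDS and collectKernelLDS are hypothetical, and the real driver is AMDGPULowerModuleLDSPass.cpp:

#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include <vector>

namespace llvm {
namespace AMDGPU {
// Declarations mirroring AMDGPULDSUtils.h after this change.
bool isKernelCC(const Function *Func);
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                                   const Function *F = nullptr);
} // namespace AMDGPU
} // namespace llvm

using namespace llvm;

// Module mode (no kernel given): select LDS variables that must live in the
// per-module struct, e.g. those used from non-kernel functions, or used only
// as initializers of globals that are themselves referenced from instructions.
static std::vector<GlobalVariable *> collectModuleLDS(Module &M) {
  return AMDGPU::findVariablesToLower(M);
}

// Kernel mode: select LDS variables whose uses can all be rewritten to members
// of this kernel's LDS struct; F must use a kernel calling convention.
static std::vector<GlobalVariable *> collectKernelLDS(Module &M, Function &F) {
  if (!AMDGPU::isKernelCC(&F))
    return {};
  return AMDGPU::findVariablesToLower(M, &F);
}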
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
new file mode 100644
index 0000000000000..336dc0c957843
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; Kernel LDS lowering.
+;.
+; @lds.1:  is part of the @llvm.used list and is also used within the kernel, hence it is lowered.
+; @lds.2:  is part of the @llvm.compiler.used list and is also used within the kernel, hence it is lowered.
+; @lds.3:  is used as the initializer of @gptr.3, hence @lds.3 is not lowered, even though it is used within the kernel.
+; @lds.4:  is used as the initializer of @gptr.4, hence @lds.4 is not lowered, even though it is used within the kernel,
+;          irrespective of the uses of @gptr.4 itself ( @gptr.4 is part of the @llvm.compiler.used list ).
+; @lds.5:  is part of the @llvm.used list but is not used within the kernel, hence it is not lowered.
+; @lds.6:  is part of the @llvm.compiler.used list but is not used within the kernel, hence it is not lowered.
+;.
+
+; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { i32, i16 }
+
+; CHECK-NOT: @lds.1
+; CHECK-NOT: @lds.2
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK: @lds.5 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.6 = addrspace(3) global i32 undef, align 4
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global i16 undef, align 2
+@lds.6 = addrspace(3) global i32 undef, align 4
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+
+; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 4
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.5 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.5 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [3 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @k0()
+; CHECK:   %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 2
+; CHECK:   %ld.lds.2 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 4
+; CHECK:   %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3, align 4
+; CHECK:   %ld.lds.4 = load float, float addrspace(3)* @lds.4, align 4
+; CHECK:   ret void
+define amdgpu_kernel void @k0() {
+  %ld.lds.1 = load i16, i16 addrspace(3)* @lds.1
+  %ld.lds.2 = load i32, i32 addrspace(3)* @lds.2
+  %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3
+  %ld.lds.4 = load float, float addrspace(3)* @lds.4
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
new file mode 100644
index 0000000000000..0ce2aba7ffde0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -0,0 +1,93 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; @lds.1:  is aliased with @alias.to.lds.1, and @alias.to.lds.1 is used within kernel @k0.
+;          Hence, @lds.1 is lowered.
+; @lds.2:  is aliased with @alias.to.lds.2, and @alias.to.lds.2 is used within non-kernel function @f0.
+;          Hence, @lds.2 is lowered.
+; @lds.3:  is used as initializer to global @gptr.3, and @gptr.3 is aliased with @alias.to.gptr.3,
+;          and @alias.to.gptr.3 is used within kernel @k1. Hence, @lds.3 is lowered.
+; @lds.4:  is used as initializer to global @gptr.4, and @gptr.4 is aliased with @alias.to.gptr.4,
+;          and @alias.to.gptr.4 is used within non-kernel @f1. Hence, @lds.4 is lowered.
+; @lds.5:  is aliased with @alias.to.lds.5, but neither @lds.5 nor @alias.to.lds.5 is used anywhere.
+;          Hence, @lds.5 is not lowered.
+; @lds.6:  is used as initializer to global @gptr.6, and @gptr.6 is aliased with @alias.to.gptr.6.
+;          But none of them are used anywhere. Hence, @lds.6 is not lowered.
+;.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+
+; CHECK-NOT: @lds.1
+; CHECK-NOT: @lds.2
+; CHECK-NOT: @lds.3
+; CHECK-NOT: @lds.4
+; CHECK: @lds.5 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 8
+; CHECK: @lds.6 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 8
+@lds.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
+@lds.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
+@lds.3 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 4
+@lds.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
+@lds.5 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 8
+@lds.6 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 8
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([3 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([6 x i8] addrspace(3)* @lds.6 to i64 addrspace(3)*) to i64*), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast ([3 x i8] addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast ([4 x i8] addrspace(3)* @lds.4 to i64*), align 8
+@gptr.6 = addrspace(1) global i64* addrspacecast ([6 x i8] addrspace(3)* @lds.6 to i64*), align 8
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
+
+; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 4)
+; CHECK: @alias.to.lds.2 = alias [2 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3)
+; CHECK: @alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3
+; CHECK: @alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4
+; CHECK: @alias.to.lds.5 = alias [5 x i8], [5 x i8] addrspace(3)* @lds.5
+; CHECK: @alias.to.gptr.6 = alias i64*, i64* addrspace(1)* @gptr.6
+@alias.to.lds.1 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.1
+@alias.to.lds.2 = alias [2 x i8], [2 x i8] addrspace(3)* @lds.2
+@alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3
+@alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4
+@alias.to.lds.5 = alias [5 x i8], [5 x i8] addrspace(3)* @lds.5
+@alias.to.gptr.6 = alias i64*, i64* addrspace(1)* @gptr.6
+
+; CHECK-LABEL: @f1
+; CHECK:   %ld = load i64*, i64* addrspace(1)* @alias.to.gptr.4, align 8
+; CHECK:   ret void
+define void @f1() {
+  %ld = load i64*, i64* addrspace(1)* @alias.to.gptr.4
+  ret void
+}
+
+; CHECK-LABEL: @f0
+; CHECK:   %bc = bitcast [2 x i8] addrspace(3)* @alias.to.lds.2 to i8 addrspace(3)*
+; CHECK:   store i8 1, i8 addrspace(3)* %bc, align 2
+; CHECK:   ret void
+define void @f0() {
+  %bc = bitcast [2 x i8] addrspace(3)* @alias.to.lds.2 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %bc, align 2
+  ret void
+}
+
+; CHECK-LABEL: @k1
+; CHECK:   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK:   %ld = load i64*, i64* addrspace(1)* @alias.to.gptr.3, align 8
+; CHECK:   ret void
+define amdgpu_kernel void @k1() {
+  %ld = load i64*, i64* addrspace(1)* @alias.to.gptr.3
+  ret void
+}
+
+; CHECK-LABEL: @k0
+; CHECK:   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK:   %bc = bitcast [1 x i8] addrspace(3)* @alias.to.lds.1 to i8 addrspace(3)*
+; CHECK:   store i8 1, i8 addrspace(3)* %bc, align 1
+; CHECK:   ret void
+define amdgpu_kernel void @k0() {
+  %bc = bitcast [1 x i8] addrspace(3)* @alias.to.lds.1 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %bc, align 1
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
new file mode 100644
index 0000000000000..7985ecff40444
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -0,0 +1,88 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+;.
+; @lds.1:  is part of the @llvm.used list and has no other uses. Hence it is not lowered.
+; @lds.2:  is part of the @llvm.compiler.used list and has no other uses. Hence it is not lowered.
+; @lds.3:  is used only as the initializer of @gptr.3. @gptr.3 itself is also not
+;          used anywhere else, hence @lds.3 is not lowered.
+; @lds.4:  is used only as the initializer of @gptr.4. @gptr.4 is part of the
+;          @llvm.compiler.used list, but has no other uses. Hence @lds.4 is not lowered.
+;
+; @lds.5:  is used only as the initializer of @gptr.5. @gptr.5 is part of the
+;          @llvm.compiler.used list, but is also used within kernel @k0. Hence @lds.5 is lowered.
+; @lds.6:  is used only as the initializer of @gptr.6. @gptr.6 is part of the
+;          @llvm.compiler.used list, but is also used within non-kernel function @f0. Hence @lds.6 is lowered.
+; @lds.7:  is used only as the initializer of @gptr.7. @gptr.7 is used as the initializer of @gptr.8,
+;          and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
+;.
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+
+; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
+; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
+; CHECK: @lds.3 = addrspace(3) global i64 undef, align 8
+; CHECK: @lds.4 = addrspace(3) global float undef, align 4
+; CHECK-NOT: @lds.5
+; CHECK-NOT: @lds.6
+; CHECK-NOT: @lds.7
+@lds.1 = addrspace(3) global i16 undef, align 2
+@lds.2 = addrspace(3) global i32 undef, align 4
+@lds.3 = addrspace(3) global i64 undef, align 8
+@lds.4 = addrspace(3) global float undef, align 4
+@lds.5 = addrspace(3) global [1 x float] undef, align 4
+@lds.6 = addrspace(3) global [2 x float] undef, align 8
+@lds.7 = addrspace(3) global [3 x float] undef, align 16
+
+; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+@gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
+@gptr.4 = addrspace(1) global i64* addrspacecast (float addrspace(3)* @lds.4 to i64*), align 8
+@gptr.5 = addrspace(1) global i64* addrspacecast ([1 x float] addrspace(3)* @lds.5 to i64*), align 8
+@gptr.6 = addrspace(1) global i64* addrspacecast ([2 x float] addrspace(3)* @lds.6 to i64*), align 8
+@gptr.7 = addrspace(1) global i64* addrspacecast ([3 x float] addrspace(3)* @lds.7 to i64*), align 8
+@gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
+
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 16
+; CHECK: @llvm.compiler.used = appending global [5 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [4 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @f1()
+; CHECK:   %ld = load i64**, i64** addrspace(1)* @gptr.8, align 8
+; CHECK:   ret void
+define void @f1() {
+  %ld = load i64**, i64** addrspace(1)* @gptr.8
+  ret void
+}
+
+; CHECK-LABEL: @f0()
+; CHECK:   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32
+; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK:   ret void
+define void @f0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.6 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @k0()
+; CHECK:   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK:   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32
+; CHECK: addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+; CHECK:   ret void
+define amdgpu_kernel void @k0() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.5 to i32 addrspace(1)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @k1()
+; CHECK:   call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK:   ret void
+define amdgpu_kernel void @k1() {
+  ret void
+}


        

