[llvm] cdb9738 - [amdgpu] Expand all ConstantExpr users of LDS variables in instructions

Jon Chesterfield via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 13 23:56:01 PDT 2022


Author: Jon Chesterfield
Date: 2022-09-14T07:55:46+01:00
New Revision: cdb9738963a1584d6530bcf7b102423f54a6bdb1

URL: https://github.com/llvm/llvm-project/commit/cdb9738963a1584d6530bcf7b102423f54a6bdb1
DIFF: https://github.com/llvm/llvm-project/commit/cdb9738963a1584d6530bcf7b102423f54a6bdb1.diff

LOG: [amdgpu] Expand all ConstantExpr users of LDS variables in instructions

The bug noted in D112717 can be sidestepped with this change.

Expanding all ConstantExprs that involve LDS up front makes the variable specialisation simpler. ConstantExprs that don't access LDS are excluded, to avoid disturbing codegen elsewhere.
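
To make the intent concrete, here is a minimal, hypothetical illustration of the rewrite; the variable @lds and kernel @k are invented for this note and do not come from the patch or its tests. A ConstantExpr use of an LDS variable such as

    @lds = internal addrspace(3) global i32 undef, align 4

    define amdgpu_kernel void @k() {
    entry:
      ; constant-expression addrspacecast used directly as the store's pointer operand
      store i32 0, i32* addrspacecast (i32 addrspace(3)* @lds to i32*), align 4
      ret void
    }

is expanded so that the cast becomes an ordinary instruction inside the using function, roughly:

    define amdgpu_kernel void @k() {
    entry:
      ; the former ConstantExpr, now an instruction local to this kernel
      %cast = addrspacecast i32 addrspace(3)* @lds to i32*
      store i32 0, i32* %cast, align 4
      ret void
    }

With only instruction uses left, specialising the variable per kernel no longer has to rewrite uniqued constants shared across functions.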

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D133422

Added: 
    llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 243d9ce19fd2f..f0e32f456ee39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -32,6 +32,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -144,6 +145,79 @@ class AMDGPULowerModuleLDS : public ModulePass {
                        "");
   }
 
+  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
+    // global may have uses from multiple different functions as a result.
+    // This pass specialises LDS variables with respect to the kernel that
+    // allocates them.
+
+    // This is semantically equivalent to:
+    // for (auto &F : M.functions())
+    //   for (auto &BB : F)
+    //     for (auto &I : BB)
+    //       for (Use &Op : I.operands())
+    //         if (constantExprUsesLDS(Op))
+    //           replaceConstantExprInFunction(I, Op);
+
+    bool Changed = false;
+
+    // Find all ConstantExpr that are direct users of an LDS global
+    SmallVector<ConstantExpr *> Stack;
+    for (auto &GV : M.globals())
+      if (AMDGPU::isLDSVariableToLower(GV))
+        for (User *U : GV.users())
+          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+            Stack.push_back(C);
+
+    // Expand to include constexpr users of direct users
+    SetVector<ConstantExpr *> ConstExprUsersOfLDS;
+    while (!Stack.empty()) {
+      ConstantExpr *V = Stack.pop_back_val();
+      if (ConstExprUsersOfLDS.contains(V))
+        continue;
+
+      ConstExprUsersOfLDS.insert(V);
+
+      for (auto *Nested : V->users())
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
+          Stack.push_back(CE);
+    }
+
+    // Find all instructions that use any of the ConstExpr users of LDS
+    SetVector<Instruction *> InstructionWorklist;
+    for (ConstantExpr *CE : ConstExprUsersOfLDS)
+      for (User *U : CE->users())
+        if (auto *I = dyn_cast<Instruction>(U))
+          InstructionWorklist.insert(I);
+
+    // Replace those ConstExpr operands with instructions
+    while (!InstructionWorklist.empty()) {
+      Instruction *I = InstructionWorklist.pop_back_val();
+      for (Use &U : I->operands()) {
+
+        auto *BI = I;
+        if (auto *Phi = dyn_cast<PHINode>(I)) {
+          BasicBlock *BB = Phi->getIncomingBlock(U);
+          BasicBlock::iterator It = BB->getFirstInsertionPt();
+          assert(It != BB->end() && "Unexpected empty basic block");
+          BI = &(*(It));
+        }
+
+        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
+          if (ConstExprUsersOfLDS.contains(C)) {
+            Changed = true;
+            Instruction *NI = C->getAsInstruction(BI);
+            InstructionWorklist.insert(NI);
+            U.set(NI);
+            C->removeDeadConstantUsers();
+          }
+        }
+      }
+    }
+
+    return Changed;
+  }
+
 public:
   static char ID;
 
@@ -156,6 +230,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
 
+    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
     // Move variables used by functions into amdgcn.module.lds
     std::vector<GlobalVariable *> ModuleScopeVariables =
         AMDGPU::findLDSVariablesToLower(M, nullptr);
@@ -216,16 +292,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
       std::vector<GlobalVariable *> KernelUsedVariables =
           AMDGPU::findLDSVariablesToLower(M, &F);
 
-      // Replace all constant uses with instructions if they belong to the
-      // current kernel. Unnecessary, removing will cause test churn.
-      for (GlobalVariable *GV : KernelUsedVariables) {
-        for (User *U : make_early_inc_range(GV->users())) {
-          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-            AMDGPU::replaceConstantUsesInFunction(C, &F);
-        }
-        GV->removeDeadConstantUsers();
-      }
-
       if (!KernelUsedVariables.empty()) {
         std::string VarName =
             (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
@@ -245,9 +311,12 @@ class AMDGPULowerModuleLDS : public ModulePass {
     }
 
     for (auto &GV : make_early_inc_range(M.globals()))
-      if (AMDGPU::isLDSVariableToLower(GV) && GV.use_empty())
-        GV.eraseFromParent();
-    
+      if (AMDGPU::isLDSVariableToLower(GV)) {
+        GV.removeDeadConstantUsers();
+        if (GV.use_empty())
+          GV.eraseFromParent();
+      }
+
     return Changed;
   }
 
@@ -361,8 +430,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
 
     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
 
-    Align StructAlign =
-        AMDGPU::getAlign(DL, LocalVars[0]);
+    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
 
     GlobalVariable *SGV = new GlobalVariable(
         M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index 3211c2dbabc3a..39e7536d62e09 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -32,35 +32,6 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
                                        GV->getValueType());
 }
 
-static void collectFunctionUses(User *U, const Function *F,
-                                SetVector<Instruction *> &InstUsers) {
-  SmallVector<User *> Stack{U};
-
-  while (!Stack.empty()) {
-    U = Stack.pop_back_val();
-
-    if (auto *I = dyn_cast<Instruction>(U)) {
-      if (I->getFunction() == F)
-        InstUsers.insert(I);
-      continue;
-    }
-
-    if (!isa<ConstantExpr>(U))
-      continue;
-
-    append_range(Stack, U->users());
-  }
-}
-
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
-  SetVector<Instruction *> InstUsers;
-
-  collectFunctionUses(C, F, InstUsers);
-  for (Instruction *I : InstUsers) {
-    convertConstantExprsToInstructions(I, C);
-  }
-}
-
 static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                                    const Function *F) {
   // We are not interested in kernel LDS lowering for module LDS itself.

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 53fd4cf335ee0..9dc712b9a334b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -33,9 +33,6 @@ bool isLDSVariableToLower(const GlobalVariable &GV);
 std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
                                                       const Function *F);
 
-/// Replace all uses of constant \p C with instructions in \p F.
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
-
 /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
 /// if this is actually a memory update or an artificial clobber to facilitate
 /// ordering constraints.

diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
index ae55b1a5521d0..5a134f7717bdd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -107,9 +107,11 @@ define amdgpu_kernel void @k4(i64 %x) {
 ; Multiple constexpr use in a same instruction.
 define amdgpu_kernel void @k5() {
 ; CHECK-LABEL: @k5(
-; CHECK-NEXT: %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT: %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
-; CHECK-NEXT: call void undef(i32* %2, i32* %2)
+; CHECK-NEXT:  %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
+; CHECK-NEXT:  %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
+; CHECK-NEXT:  %3 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
+; CHECK-NEXT:  %4 = getelementptr inbounds [505 x i32], [505 x i32]* %3, i64 0, i64 0
+; CHECK-NEXT:  call void undef(i32* %2, i32* %4)
 ;
   call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
   ret void
@@ -119,13 +121,15 @@ define amdgpu_kernel void @k5() {
 
 ; Both the *value* and *pointer* operands of store instruction are constant expressions, and
 ; both of these constant expression paths use same lds - @lds.5. Hence both of these constant
-; expression operands of store should be replaced by corresponding instruction sequence.
+; expression operands of store should be replaced by equivalent instruction sequences.
 define amdgpu_kernel void @k6() {
 ; CHECK-LABEL: @k6(
-; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT: %2 = ptrtoint i32 addrspace(3)* %1 to i32
-; CHECK-NEXT: store i32 %2, i32 addrspace(3)* %1, align 8
-; CHECK-NEXT: ret void
+
+; CHECK-NEXT:  %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
+; CHECK-NEXT:  %2 = ptrtoint i32 addrspace(3)* %1 to i32
+; CHECK-NEXT:  %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
+; CHECK-NEXT:  store i32 %2, i32 addrspace(3)* %3, align 8
+; CHECK-NEXT:  ret void
 ;
   store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2)
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 8d67a21f31036..6fac9360b6d73 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -18,9 +18,15 @@ $_f2 = comdat any
 
 ; CHECK-LABEL: @test
 ; CHECK: store i8 3, i8 addrspace(3)* %0, align 4, !alias.scope !0, !noalias !3
-; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i64 3, i1 false), !alias.scope !5, !noalias !6
+; CHECK: %1 = getelementptr
+; CHECK: %2 = getelementptr
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i64 3, i1 false), !alias.scope !5, !noalias !6
 ; CHECK: %4 = load i8, i8 addrspace(3)* %3, align 4, !alias.scope !3, !noalias !0
-; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i64 3, i1 false), !alias.scope !5, !noalias !6
+; CHECK: %5 = getelementptr
+; CHECK: %6 = getelementptr
+; CHECK: %7 = getelementptr
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i64 3, i1 false), !alias.scope !5, !noalias !6
+; CHECK: %8 = getelementptr
 ; CHECK: %9 = load i8, i8 addrspace(3)* %8, align 4, !alias.scope !3, !noalias !0
 
 define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce) local_unnamed_addr #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll
new file mode 100644
index 0000000000000..097641b0bc9c9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr-phi.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+ at var = addrspace(3) global i32 undef, align 4
+
+; Regression test. Duplicate constantexpr in phi nodes shall not emit broken IR
+define amdgpu_kernel void @func(i32 %c) {
+; CHECK-LABEL: @func(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[C:%.*]], label [[RETURN:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.func.lds to ptr
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.func.lds to ptr
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP:%.*]] = phi ptr [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  switch i32 %c, label %return [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  br label %bb2
+
+bb1:
+  br label %bb2
+
+bb2:
+  %tmp = phi ptr [ addrspacecast (ptr addrspace(3) @var to ptr), %bb0 ], [ addrspacecast (ptr addrspace(3) @var to ptr), %bb1 ]
+  br label %return
+
+return:
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
index 37334b2ac81e4..cffaf0d9295df 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -20,7 +20,16 @@
 ; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4
 
 ; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK:       %0 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK:       %1 = addrspacecast i32 addrspace(3)* %0 to i32*
+; CHECK:       %2 = ptrtoint i32* %1 to i64
+; CHECK:       %3 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK:       %4 = addrspacecast i32 addrspace(3)* %3 to i32*
+; CHECK:       %5 = ptrtoint i32* %4 to i64
+; CHECK:       %6 = add i64 %2, %5
+; CHECK:       %7 = inttoptr i64 %6 to i32*
+; CHECK:       %8 = load i32, i32* %7, align 4
+; CHECK:       ret i32 %8
 define i32 @get_func() local_unnamed_addr #0 {
 entry:
   %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -28,7 +37,16 @@ entry:
 }
 
 ; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK:      %0 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK:      %1 = addrspacecast i32 addrspace(3)* %0 to i32*
+; CHECK:      %2 = ptrtoint i32* %1 to i64
+; CHECK:      %3 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK:      %4 = addrspacecast i32 addrspace(3)* %3 to i32*
+; CHECK:      %5 = ptrtoint i32* %4 to i64
+; CHECK:      %6 = add i64 %2, %5
+; CHECK:      %7 = inttoptr i64 %6 to i32*
+; CHECK:      store i32 %x, i32* %7, align 4
+; CHECK:      ret void
 define void @set_func(i32 %x) local_unnamed_addr #1 {
 entry:
   store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -37,19 +55,28 @@ entry:
 
 ; CHECK-LABEL: @timestwo() #0
 ; CHECK-NOT: call void @llvm.donothing()
-; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
-; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
-; CHECK: %5 = inttoptr i64 %4 to i32*
-; CHECK: %ld = load i32, i32* %5, align 4
-; CHECK: %mul = mul i32 %ld, 2
-; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
-; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
-; CHECK: %10 = inttoptr i64 %9 to i32*
-; CHECK: store i32 %mul, i32* %10, align 4
+
+
+; CHECK:      %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK:      %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK:      %3 = ptrtoint i32* %2 to i64
+; CHECK:      %4 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK:      %5 = addrspacecast i32 addrspace(3)* %4 to i32*
+; CHECK:      %6 = ptrtoint i32* %5 to i64
+; CHECK:      %7 = add i64 %3, %6
+; CHECK:      %8 = inttoptr i64 %7 to i32*
+; CHECK:      %ld = load i32, i32* %8, align 4
+; CHECK:      %mul = mul i32 %ld, 2
+; CHECK:      %9 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK:      %10 = addrspacecast i32 addrspace(3)* %9 to i32*
+; CHECK:      %11 = ptrtoint i32* %10 to i64
+; CHECK:      %12 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK:      %13 = addrspacecast i32 addrspace(3)* %12 to i32*
+; CHECK:      %14 = ptrtoint i32* %13 to i64
+; CHECK:      %15 = add i64 %11, %14
+; CHECK:      %16 = inttoptr i64 %15 to i32*
+; CHECK:      store i32 %mul, i32* %16, align 4
+; CHECK:      ret void
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2


        

