[llvm] [AMDGPU] Fix SP calculations considering growing up stack for dynamic alloca (PR #119168)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 11 02:34:03 PST 2024


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/119168

From c46f7b6270e8004ecbe4997e0d829fa7fa1230ba Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sat, 7 Dec 2024 16:27:59 +0530
Subject: [PATCH] Refactor code to adjust for the upward-growing stack and
 return the correct start address

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  31 +-
 .../CodeGen/AMDGPU/non-entry-alloca-mir.ll    | 448 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll  |  64 ++-
 3 files changed, 493 insertions(+), 50 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/non-entry-alloca-mir.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc8bbb154d035d..4455488c055e70 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4002,8 +4002,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          InVals, /*IsThisReturn=*/false, SDValue());
 }
 
-// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for applying the wave size scale to the increment amount.
+// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for the stack growth direction (default: downwards, AMDGPU: upwards)
+// and for applying the wave size scale to the increment amount.
 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
                                                       SelectionDAG &DAG) const {
   const MachineFunction &MF = DAG.getMachineFunction();
@@ -4023,10 +4024,20 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
   SDValue Size = Tmp2.getOperand(1);
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
-  Chain = SP.getValue(1);
+  SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  Chain = SPOld.getValue(1);
   MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
+  Align StackAlign = TFL->getStackAlign();
+  if (Alignment && *Alignment > StackAlign) {
+    SDValue ScaledAlignment = DAG.getSignedConstant((uint64_t)Alignment->value()
+                                  << Subtarget->getWavefrontSizeLog2(),
+                                  dl, VT);
+    SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment,
+                                         DAG.getConstant(1, dl, VT));
+    Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask);
+    SPOld = DAG.getNode(ISD::AND, dl, VT, Tmp1, DAG.getNOT(dl, StackAlignMask, VT));
+  }
   unsigned Opc =
       TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
           ? ISD::ADD
@@ -4036,20 +4047,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
       ISD::SHL, dl, VT, Size,
       DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
 
-  Align StackAlign = TFL->getStackAlign();
-  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
-  if (Alignment && *Alignment > StackAlign) {
-    Tmp1 = DAG.getNode(
-        ISD::AND, dl, VT, Tmp1,
-        DAG.getSignedConstant(-(uint64_t)Alignment->value()
-                                  << Subtarget->getWavefrontSizeLog2(),
-                              dl, VT));
-  }
+  Tmp1 = DAG.getNode(Opc, dl, VT, SPOld, ScaledSize); // Value
 
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
   Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
 
-  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+  return DAG.getMergeValues({SPOld, Tmp2}, dl);
 }
 
 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
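
For reference, a minimal standalone sketch (not part of the patch) of the
pointer math the rewritten lowering performs, assuming a 64-lane wavefront
(gfx900) and plain integer arithmetic in place of SelectionDAG nodes; the
function name and signature are invented for illustration:

  #include <cstdint>

  // Model of the upward-growing AMDGPU private stack: align the old SP up
  // to the wave-scaled alignment, take that as the alloca's base address,
  // and only then bump SP past the object. The returned value is the
  // pre-bump (aligned) SP, not the new SP.
  uint32_t lowerDynamicAlloca(uint32_t &SP, uint32_t SizeBytes,
                              uint32_t AlignBytes) {
    const uint32_t WaveSizeLog2 = 6;          // wavefront of 64 lanes
    uint32_t ScaledAlign = AlignBytes << WaveSizeLog2;
    uint32_t Mask = ScaledAlign - 1;          // StackAlignMask in the patch
    uint32_t Base = (SP + Mask) & ~Mask;      // align the old SP upward
    SP = Base + (SizeBytes << WaveSizeLog2);  // wave-scaled size increment
    return Base;
  }

For example, with SP = 16, AlignBytes = 512 and SizeBytes = 40, the base
rounds up to 0x8000 (512 << 6) and SP advances to 0x8000 + (40 << 6); the
alloca's address is the saved, aligned old SP, which is why the updated
tests below copy the stack pointer first, store through the copy, and only
then bump it.
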
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca-mir.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca-mir.ll
new file mode 100644
index 00000000000000..a10fe25dca3e26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca-mir.ll
@@ -0,0 +1,448 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN:  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=MUBUF %s
+; RUN:  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=MUBUF-V5 %s
+; RUN:  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel < %s -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF %s
+; RUN:  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel < %s -mattr=+enable-flat-scratch | FileCheck -check-prefixes=FLATSCR %s
+; RUN:  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=amdgpu-isel < %s -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=FLATSCR %s
+
+define amdgpu_kernel void @non_entry_block_alloca(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
+  ; MUBUF-LABEL: name: non_entry_block_alloca
+  ; MUBUF: bb.0.entry:
+  ; MUBUF-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.1.bb.0:
+  ; MUBUF-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
+  ; MUBUF-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.2.bb.1:
+  ; MUBUF-NEXT:   S_ENDPGM 0
+  ;
+  ; MUBUF-V5-LABEL: name: non_entry_block_alloca
+  ; MUBUF-V5: bb.0.entry:
+  ; MUBUF-V5-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-V5-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-V5-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-V5-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-V5-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.1.bb.0:
+  ; MUBUF-V5-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
+  ; MUBUF-V5-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.2.bb.1:
+  ; MUBUF-V5-NEXT:   S_ENDPGM 0
+  ;
+  ; FLATSCR-LABEL: name: non_entry_block_alloca
+  ; FLATSCR: bb.0.entry:
+  ; FLATSCR-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; FLATSCR-NEXT:   liveins: $sgpr4_sgpr5
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; FLATSCR-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; FLATSCR-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; FLATSCR-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; FLATSCR-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; FLATSCR-NEXT:   S_BRANCH %bb.1
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.1.bb.0:
+  ; FLATSCR-NEXT:   successors: %bb.2(0x80000000)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xexec_hi = COPY $sp_reg
+  ; FLATSCR-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
+  ; FLATSCR-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; FLATSCR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.2.bb.1:
+  ; FLATSCR-NEXT:   S_ENDPGM 0
+    entry:
+    %cond = icmp eq i32 %arg.cond, 0
+    br i1 %cond, label %bb.0, label %bb.1
+
+    bb.0:
+    %alloca = alloca i32, addrspace(5)
+    store volatile i32 0, ptr addrspace(5) %alloca
+    br label %bb.1
+
+    bb.1:
+    ret void
+}
+
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_align512(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
+  ; MUBUF-LABEL: name: kernel_non_entry_block_static_alloca_align512
+  ; MUBUF: bb.0.entry:
+  ; MUBUF-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.1.bb.0:
+  ; MUBUF-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; MUBUF-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_1]], $private_rsrc_reg, [[COPY1]], 4, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.2.bb.1:
+  ; MUBUF-NEXT:   S_ENDPGM 0
+  ;
+  ; MUBUF-V5-LABEL: name: kernel_non_entry_block_static_alloca_align512
+  ; MUBUF-V5: bb.0.entry:
+  ; MUBUF-V5-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-V5-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-V5-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-V5-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-V5-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.1.bb.0:
+  ; MUBUF-V5-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; MUBUF-V5-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_1]], $private_rsrc_reg, [[COPY1]], 4, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.2.bb.1:
+  ; MUBUF-V5-NEXT:   S_ENDPGM 0
+  ;
+  ; FLATSCR-LABEL: name: kernel_non_entry_block_static_alloca_align512
+  ; FLATSCR: bb.0.entry:
+  ; FLATSCR-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; FLATSCR-NEXT:   liveins: $sgpr4_sgpr5
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; FLATSCR-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; FLATSCR-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; FLATSCR-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; FLATSCR-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; FLATSCR-NEXT:   S_BRANCH %bb.1
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.1.bb.0:
+  ; FLATSCR-NEXT:   successors: %bb.2(0x80000000)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xexec_hi = COPY $sp_reg
+  ; FLATSCR-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; FLATSCR-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; FLATSCR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+  ; FLATSCR-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_1]], killed [[S_ADD_I32_1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.2.bb.1:
+  ; FLATSCR-NEXT:   S_ENDPGM 0
+    entry:
+    %cond = icmp eq i32 %arg.cond, 0
+    br i1 %cond, label %bb.0, label %bb.1
+
+    bb.0:
+    %alloca = alloca [16 x i32], align 512, addrspace(5)
+    %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+    store volatile i32 0, ptr addrspace(5) %alloca
+    store volatile i32 1, ptr addrspace(5) %gep1
+    br label %bb.1
+
+    bb.1:
+    ret void
+}
+
+define amdgpu_kernel void @kernel_non_entry_block_multiple_static_alloca(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
+  ; MUBUF-LABEL: name: kernel_non_entry_block_multiple_static_alloca
+  ; MUBUF: bb.0.entry:
+  ; MUBUF-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.1.bb.0:
+  ; MUBUF-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; MUBUF-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 6144
+  ; MUBUF-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; MUBUF-NEXT:   $sp_reg = COPY [[S_ADD_I32_1]]
+  ; MUBUF-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_1]], $private_rsrc_reg, [[COPY1]], 4, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_2]], $private_rsrc_reg, [[COPY2]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca2, addrspace 5)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.2.bb.1:
+  ; MUBUF-NEXT:   S_ENDPGM 0
+  ;
+  ; MUBUF-V5-LABEL: name: kernel_non_entry_block_multiple_static_alloca
+  ; MUBUF-V5: bb.0.entry:
+  ; MUBUF-V5-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; MUBUF-V5-NEXT:   liveins: $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; MUBUF-V5-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-V5-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; MUBUF-V5-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; MUBUF-V5-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.1.bb.0:
+  ; MUBUF-V5-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; MUBUF-V5-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sp_reg
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 6144
+  ; MUBUF-V5-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   $sp_reg = COPY [[S_ADD_I32_1]]
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $private_rsrc_reg, [[COPY1]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_1]], $private_rsrc_reg, [[COPY1]], 4, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_2]], $private_rsrc_reg, [[COPY2]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca2, addrspace 5)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.2.bb.1:
+  ; MUBUF-V5-NEXT:   S_ENDPGM 0
+  ;
+  ; FLATSCR-LABEL: name: kernel_non_entry_block_multiple_static_alloca
+  ; FLATSCR: bb.0.entry:
+  ; FLATSCR-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; FLATSCR-NEXT:   liveins: $sgpr4_sgpr5
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; FLATSCR-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg.cond.kernarg.offset, align 8, addrspace 4)
+  ; FLATSCR-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; FLATSCR-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc
+  ; FLATSCR-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; FLATSCR-NEXT:   S_BRANCH %bb.1
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.1.bb.0:
+  ; FLATSCR-NEXT:   successors: %bb.2(0x80000000)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xexec_hi = COPY $sp_reg
+  ; FLATSCR-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+  ; FLATSCR-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   $sp_reg = COPY [[S_ADD_I32_]]
+  ; FLATSCR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[COPY2:%[0-9]+]]:sreg_32_xexec_hi = COPY $sp_reg
+  ; FLATSCR-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 6144
+  ; FLATSCR-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   $sp_reg = COPY [[S_ADD_I32_1]]
+  ; FLATSCR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+  ; FLATSCR-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_3]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_1]], killed [[S_ADD_I32_2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_2]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.alloca2, addrspace 5)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.2.bb.1:
+  ; FLATSCR-NEXT:   S_ENDPGM 0
+    entry:
+    %cond = icmp eq i32 %arg.cond, 0
+    br i1 %cond, label %bb.0, label %bb.1
+
+    bb.0:
+    %alloca = alloca [16 x i32], addrspace(5)
+    %alloca2 = alloca i64, i32 12, align 1024, addrspace(5)
+    %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+    store volatile i32 0, ptr addrspace(5) %alloca
+    store volatile i32 1, ptr addrspace(5) %gep1
+    store volatile i32 2, ptr addrspace(5) %alloca2
+    br label %bb.1
+
+    bb.1:
+    ret void
+}
+
+define void @device_non_entry_block_static_alloca(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
+  ; MUBUF-LABEL: name: device_non_entry_block_static_alloca
+  ; MUBUF: bb.0.entry:
+  ; MUBUF-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; MUBUF-NEXT:   liveins: $vgpr2, $vgpr3
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; MUBUF-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; MUBUF-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
+  ; MUBUF-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; MUBUF-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.1.bb.0:
+  ; MUBUF-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr32
+  ; MUBUF-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3072
+  ; MUBUF-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-NEXT:   $sgpr32 = COPY [[S_ADD_I32_]]
+  ; MUBUF-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; MUBUF-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; MUBUF-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_2]], [[COPY3]], implicit $exec
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, [[COPY2]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-NEXT:   BUFFER_STORE_DWORD_OFFEN killed [[V_MOV_B32_e32_1]], killed [[V_LSHL_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-NEXT: {{  $}}
+  ; MUBUF-NEXT: bb.2.bb.1:
+  ; MUBUF-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; MUBUF-NEXT:   SI_RETURN
+  ;
+  ; MUBUF-V5-LABEL: name: device_non_entry_block_static_alloca
+  ; MUBUF-V5: bb.0.entry:
+  ; MUBUF-V5-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; MUBUF-V5-NEXT:   liveins: $vgpr2, $vgpr3
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; MUBUF-V5-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MUBUF-V5-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
+  ; MUBUF-V5-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; MUBUF-V5-NEXT:   S_BRANCH %bb.1
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.1.bb.0:
+  ; MUBUF-V5-NEXT:   successors: %bb.2(0x80000000)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr32
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3072
+  ; MUBUF-V5-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   $sgpr32 = COPY [[S_ADD_I32_]]
+  ; MUBUF-V5-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; MUBUF-V5-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; MUBUF-V5-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; MUBUF-V5-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_2]], [[COPY3]], implicit $exec
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, [[COPY2]], 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; MUBUF-V5-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; MUBUF-V5-NEXT:   BUFFER_STORE_DWORD_OFFEN killed [[V_MOV_B32_e32_1]], killed [[V_LSHL_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; MUBUF-V5-NEXT: {{  $}}
+  ; MUBUF-V5-NEXT: bb.2.bb.1:
+  ; MUBUF-V5-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; MUBUF-V5-NEXT:   SI_RETURN
+  ;
+  ; FLATSCR-LABEL: name: device_non_entry_block_static_alloca
+  ; FLATSCR: bb.0.entry:
+  ; FLATSCR-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; FLATSCR-NEXT:   liveins: $vgpr2, $vgpr3
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; FLATSCR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; FLATSCR-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; FLATSCR-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], implicit $exec
+  ; FLATSCR-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; FLATSCR-NEXT:   S_BRANCH %bb.1
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.1.bb.0:
+  ; FLATSCR-NEXT:   successors: %bb.2(0x80000000)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[COPY2:%[0-9]+]]:sreg_32_xexec_hi = COPY $sgpr32
+  ; FLATSCR-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3072
+  ; FLATSCR-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY2]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+  ; FLATSCR-NEXT:   $sgpr32 = COPY [[S_ADD_I32_]]
+  ; FLATSCR-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc
+  ; FLATSCR-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; FLATSCR-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+  ; FLATSCR-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_2]], [[COPY3]], implicit $exec
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.alloca, addrspace 5)
+  ; FLATSCR-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; FLATSCR-NEXT:   SCRATCH_STORE_DWORD killed [[V_MOV_B32_e32_1]], killed [[V_LSHL_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.gep1, addrspace 5)
+  ; FLATSCR-NEXT: {{  $}}
+  ; FLATSCR-NEXT: bb.2.bb.1:
+  ; FLATSCR-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; FLATSCR-NEXT:   SI_RETURN
+    entry:
+    %cond = icmp eq i32 %arg.cond, 0
+    br i1 %cond, label %bb.0, label %bb.1
+
+    bb.0:
+    %alloca = alloca i32, i32 10, align 512, addrspace(5)
+    %gep1 = getelementptr i32, ptr addrspace(5) %alloca, i32 %in
+    store volatile i32 0, ptr addrspace(5) %alloca
+    store volatile i32 1, ptr addrspace(5) %gep1
+    br label %bb.1
+
+    bb.1:
+    ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 85096eb63f46e1..0477d55e9baa36 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
 ; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
 ; MUBUF-NEXT:  ; %bb.2: ; %bb.1
-; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
-; MUBUF-NEXT:    s_mov_b32 s32, s6
+; MUBUF-NEXT:    s_mov_b32 s6, s32
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
-; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
+; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
+; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
+; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s6
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6 offset:4
 ; MUBUF-NEXT:    s_add_i32 s6, s6, s7
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
 ; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
 ; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
 ; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
-; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT:    s_mov_b32 s2, s32
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
 ; FLATSCR-NEXT:    s_lshl_b32 s3, s6, 2
-; FLATSCR-NEXT:    s_mov_b32 s32, s2
+; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
 ; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
 ; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_cmp_lg_u32 s4, 0
 ; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_2
 ; MUBUF-NEXT:  ; %bb.1: ; %bb.0
-; MUBUF-NEXT:    s_add_i32 s4, s32, 0x1000
-; MUBUF-NEXT:    s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT:    s_lshl_b32 s5, s5, 2
-; MUBUF-NEXT:    s_mov_b32 s32, s4
+; MUBUF-NEXT:    s_mov_b32 s4, s32
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v2, s4
-; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
+; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
+; MUBUF-NEXT:    s_lshl_b32 s5, s5, 2
+; MUBUF-NEXT:    s_add_i32 s32, s4, 0x1000
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s4 offset:4
 ; MUBUF-NEXT:    s_add_i32 s4, s4, s5
-; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, s4
 ; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_cmp_lg_u32 s0, 0
 ; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_2
 ; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
-; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; FLATSCR-NEXT:    s_mov_b32 s0, s32
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
 ; FLATSCR-NEXT:    s_lshl_b32 s1, s1, 2
-; FLATSCR-NEXT:    s_mov_b32 s32, s0
+; FLATSCR-NEXT:    s_add_i32 s32, s0, 0x1000
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    s_add_i32 s0, s0, s1
 ; FLATSCR-NEXT:    scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
 ; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
 ; MUBUF-NEXT:  ; %bb.2: ; %bb.1
-; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
+; MUBUF-NEXT:    s_mov_b32 s6, s32
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v3, s6
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6 offset:4
 ; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
 ; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT:    s_mov_b32 s32, s6
+; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
 ; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
 ; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
-; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT:    s_mov_b32 s2, s32
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
 ; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s2
 ; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT:    s_mov_b32 s32, s2
+; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; MUBUF-NEXT:    s_cbranch_execz .LBB3_2
 ; MUBUF-NEXT:  ; %bb.1: ; %bb.0
-; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
+; MUBUF-NEXT:    s_mov_b32 s6, s32
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
-; MUBUF-NEXT:    v_mov_b32_e32 v4, s6
-; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6
 ; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
-; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s6 offset:4
 ; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
 ; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 ; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT:    s_mov_b32 s32, s6
+; MUBUF-NEXT:    s_add_i32 s32, s6, 0x1000
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
 ; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB3_2
 ; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
-; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
-; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
+; FLATSCR-NEXT:    s_mov_b32 s2, s32
 ; FLATSCR-NEXT:    v_mov_b32_e32 v4, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v5, 1
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
 ; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
 ; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT:    s_mov_b32 s32, s2
+; FLATSCR-NEXT:    s_add_i32 s32, s2, 0x1000
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off


