[llvm] [AMDGPU] Update base addr of dyn alloca considering GrowingUp stack (PR #119822)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 18 22:40:49 PST 2024
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/119822
From 2d06d3a75f96f8f923446ac1ee5d1f61832a4429 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sat, 7 Dec 2024 16:27:59 +0530
Subject: [PATCH 1/3] Fix the base address of dynamically sized stack objects
 for a growing-up stack
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 +++++--
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 64 +++++++++-----------
2 files changed, 45 insertions(+), 42 deletions(-)
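For orientation, here is a minimal standalone sketch (plain C++ with illustrative names, not the DAG lowering itself) of the semantics this patch moves to: since the AMDGPU stack grows upwards, a dynamic alloca should hand back the old stack pointer as the object's base address and advance SP past the allocation, instead of returning the bumped SP as the downward-growing default lowering does.

  #include <cstdint>

  // Hypothetical model of the intended bump allocation; `sp` is the
  // wave-scaled stack pointer and `sizePerLane` a per-lane byte count.
  uint32_t dynAlloca(uint32_t &sp, uint32_t sizePerLane,
                     unsigned wavefrontSizeLog2) {
    uint32_t scaledSize = sizePerLane << wavefrontSizeLog2; // per-wave bytes
    uint32_t base = sp;     // object lives at the old SP (stack grows up)
    sp = base + scaledSize; // new SP points just past the allocation
    return base;            // previously the bumped SP was returned here
  }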
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7da93f90341d22..7fb4eb4ab27213 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4016,8 +4016,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}
-// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for applying the wave size scale to the increment amount.
+// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for the stack growth direction (default: downwards, AMDGPU: upwards) and
+// applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
@@ -4037,19 +4038,29 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
+ SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SPOld.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
"Stack grows upwards for AMDGPU");
+ Align StackAlign = TFL->getStackAlign();
+ if (Alignment && *Alignment > StackAlign) {
+ SDValue ScaledAlignment = DAG.getSignedConstant(
+ (uint64_t)Alignment->value() << Subtarget->getWavefrontSizeLog2(), dl,
+ VT);
+ SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment,
+ DAG.getConstant(1, dl, VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask);
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, ScaledAlignment);
+ }
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
Align StackAlign = TFL->getStackAlign();
- Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, ScaledSize); // Value
if (Alignment && *Alignment > StackAlign) {
Tmp1 = DAG.getNode(
ISD::AND, dl, VT, Tmp1,
@@ -4061,7 +4072,7 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
- return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+ return DAG.getMergeValues({SPOld, Tmp2}, dl);
}
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
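For over-aligned allocas, the hunk above biases the incoming SP before using it as the base address. For reference, the standard align-up identity with the alignment pre-scaled by the wavefront size is sketched below (illustrative names; assumes a power-of-two alignment):

  #include <cstdint>

  // Round sp up to a multiple of (align << log2ws).
  uint32_t alignUpScaled(uint32_t sp, uint32_t align, unsigned log2ws) {
    uint32_t scaled = align << log2ws; // e.g. align 32 on wave64: 32 << 6 = 0x800
    uint32_t mask = scaled - 1;        // 0x7ff
    return (sp + mask) & ~mask;        // bias, then clear the low bits
  }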
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 85096eb63f46e1..0477d55e9baa36 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s6
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: s_add_i32 s6, s6, s7
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
-; MUBUF-NEXT: s_mov_b32 s32, s4
+; MUBUF-NEXT: s_mov_b32 s4, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s4
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
+; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
; MUBUF-NEXT: s_add_i32 s4, s4, s5
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s0
+; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v3, s6
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v4, s6
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
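Reading the updated MUBUF checks above: the base now comes straight from the old SP, the accesses use it as the SGPR offset, and SP is bumped only afterwards. An annotated excerpt (the 0x1000 increment is assumed to be a 64-byte per-lane frame scaled by the wave64 factor):

  // s_mov_b32  s6, s32                      ; base = old SP
  // buffer_store_dword v1, off, s[0:3], s6  ; access at the base address
  // s_add_i32  s32, s6, 0x1000              ; SP advances past the frame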
From a7ca93ace27abd1bd75eee450ad598c29ba59525 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Fri, 13 Dec 2024 15:47:57 +0530
Subject: [PATCH 2/3] GlobalISel code change, address review comments
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 13 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 38 ++-
.../GlobalISel/dynamic-alloca-uniform.ll | 219 ++++++++++--------
.../AMDGPU/GlobalISel/non-entry-alloca.ll | 26 ++-
.../regbankselect-dyn-stackalloc.mir | 180 +++++++++-----
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 6 +-
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 26 ++-
7 files changed, 301 insertions(+), 207 deletions(-)
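The GlobalISel side below follows the same scheme with MachineIRBuilder: copy the old SP, optionally align it up, return it as the result, then bump $sp_reg. A condensed sketch of that flow, using only the builder calls visible in the hunk (OldSP, Dst, and ScaledSize named as in the patch):

  // Base candidate is the old SP.
  auto OldSP = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    // Bias by (scaled alignment - 1), then clear the low bits.
    auto Mask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
    auto Biased = B.buildPtrAdd(PtrTy, OldSP,
                                B.buildConstant(LLT::scalar(32), Mask));
    B.buildMaskLowPtrBits(Dst, Biased,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildCopy(Dst, OldSP); // base = old SP as-is
  }
  // Bump SP past the allocation.
  B.buildCopy(SPReg, B.buildPtrAdd(PtrTy, Dst, ScaledSize));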
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c05f079516ba68..d94c400ad14225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1204,15 +1204,18 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
- auto SPCopy = B.buildCopy(PtrTy, SPReg);
+ auto OldSP = B.buildCopy(PtrTy, SPReg);
if (Alignment > TFI.getStackAlign()) {
- auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
- B.buildMaskLowPtrBits(Dst, PtrAdd,
+ auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
+ auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
+ B.buildConstant(LLT::scalar(32), StackAlignMask));
+ B.buildMaskLowPtrBits(Dst, Tmp1,
Log2(Alignment) + ST.getWavefrontSizeLog2());
} else {
- B.buildPtrAdd(Dst, SPCopy, ScaledSize);
+ B.buildCopy(Dst, OldSP);
}
-
+ auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
+ B.buildCopy(SPReg, PtrAdd);
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7fb4eb4ab27213..0eba71077db6bf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4038,41 +4038,35 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
- SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SPOld.getValue(1);
- MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
+ SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
"Stack grows upwards for AMDGPU");
+
+ Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
- if (Alignment && *Alignment > StackAlign) {
- SDValue ScaledAlignment = DAG.getSignedConstant(
- (uint64_t)Alignment->value() << Subtarget->getWavefrontSizeLog2(), dl,
- VT);
- SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment,
- DAG.getConstant(1, dl, VT));
- Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask);
- Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, ScaledAlignment);
+ if (Alignment > StackAlign) {
+ auto ScaledAlignment = (uint64_t)Alignment.value()
+ << Subtarget->getWavefrontSizeLog2();
+ auto StackAlignMask = ScaledAlignment - 1;
+ auto TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
+ DAG.getConstant(StackAlignMask, dl, VT));
+ BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
+ DAG.getConstant(ScaledAlignment, dl, VT));
}
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
- Align StackAlign = TFL->getStackAlign();
- Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, ScaledSize); // Value
- if (Alignment && *Alignment > StackAlign) {
- Tmp1 = DAG.getNode(
- ISD::AND, dl, VT, Tmp1,
- DAG.getSignedConstant(-(uint64_t)Alignment->value()
- << Subtarget->getWavefrontSizeLog2(),
- dl, VT));
- }
+ auto NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
- Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
- return DAG.getMergeValues({SPOld, Tmp2}, dl);
+ return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
}
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
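With this revision the DAG path emits: copy the old SP, optionally bias-and-mask it to the scaled alignment, return that value as the alloca address, then set SP = base + scaled size. Worked numbers for the align(32), wave64 case, matching the updated GFX9 checks below:

  // scaled alignment = 32 << 6   = 0x800
  // bias             = 0x800 - 1 = 0x7ff      -> s_add_u32 s5, s32, 0x7ff
  // mask             = -0x800    = 0xfffff800 -> s_and_b32 s5, s5, 0xfffff800
  // scaled size      = perLaneSize << 6       -> s_lshl_b32 s4, s4, 6
  // new SP           = base + scaled size     -> s_add_u32 s32, s5, s4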
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 741323a201d02e..ae055ea041297e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -8,52 +8,55 @@
define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_movk_i32 s32, 0x400
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s4, s32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
-; GFX9-NEXT: s_and_b32 s4, s4, -16
-; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
+; GFX9-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX9-NEXT: s_and_b32 s5, s5, -16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_lshl_b32 s5, s5, 6
; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_add_u32 s32, s4, s5
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_mov_b32 s4, s32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
-; GFX10-NEXT: s_and_b32 s4, s4, -16
-; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX10-NEXT: s_and_b32 s5, s5, -16
+; GFX10-NEXT: s_lshl_b32 s5, s5, 5
+; GFX10-NEXT: s_add_u32 s32, s4, s5
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s32, 16
; GFX11-NEXT: s_mov_b32 s33, 0
+; GFX11-NEXT: s_mov_b32 s0, s32
+; GFX11-NEXT: scratch_store_b32 off, v0, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
+; GFX11-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, s0, -16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-NEXT: s_and_b32 s1, s1, -16
+; GFX11-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: s_add_u32 s32, s0, s1
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 4, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
@@ -64,24 +67,25 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s7, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s6, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_add_u32 s32, s6, s4
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -89,31 +93,32 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s7, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s6, s32
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-NEXT: s_mov_b32 s33, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
+; GFX10-NEXT: s_add_u32 s32, s6, s4
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -121,7 +126,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_mov_b32 s2, s32
+; GFX11-NEXT: s_mov_b32 s33, s3
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -129,10 +136,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s32, s2, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 4
%alloca = alloca i32, i32 %n, addrspace(5)
@@ -143,52 +149,55 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_movk_i32 s32, 0x400
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s4, s32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
-; GFX9-NEXT: s_and_b32 s4, s4, -16
-; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
+; GFX9-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX9-NEXT: s_and_b32 s5, s5, -16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_lshl_b32 s5, s5, 6
; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_add_u32 s32, s4, s5
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_mov_b32 s4, s32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
-; GFX10-NEXT: s_and_b32 s4, s4, -16
-; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX10-NEXT: s_and_b32 s5, s5, -16
+; GFX10-NEXT: s_lshl_b32 s5, s5, 5
+; GFX10-NEXT: s_add_u32 s32, s4, s5
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s32, 16
; GFX11-NEXT: s_mov_b32 s33, 0
+; GFX11-NEXT: s_mov_b32 s0, s32
+; GFX11-NEXT: scratch_store_b32 off, v0, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
+; GFX11-NEXT: s_lshl2_add_u32 s1, s1, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, s0, -16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-NEXT: s_and_b32 s1, s1, -16
+; GFX11-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: s_add_u32 s32, s0, s1
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 16, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
@@ -199,24 +208,25 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s7, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s6, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s33, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_add_u32 s32, s6, s4
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -224,31 +234,32 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s7, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s6, s32
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s6
+; GFX10-NEXT: s_mov_b32 s33, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
+; GFX10-NEXT: s_add_u32 s32, s6, s4
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
@@ -256,7 +267,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s33, s2
+; GFX11-NEXT: s_mov_b32 s2, s32
+; GFX11-NEXT: s_mov_b32 s33, s3
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,10 +277,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s32, s2, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv, align 16
%alloca = alloca i32, i32 %n, addrspace(5)
@@ -279,37 +291,39 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_add_u32 s0, s0, s17
; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
+; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800
; GFX9-NEXT: s_and_b32 s4, s4, -16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_add_u32 s32, s5, s4
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-NEXT: s_movk_i32 s32, 0x400
; GFX10-NEXT: s_add_u32 s0, s0, s17
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_movk_i32 s32, 0x400
+; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff
; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00
; GFX10-NEXT: s_mov_b32 s33, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
-; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_add_u32 s32, s5, s4
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
@@ -317,16 +331,17 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 s32, 32
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff
; GFX11-NEXT: s_mov_b32 s33, 0
+; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00
+; GFX11-NEXT: scratch_store_b32 off, v0, s1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
-; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s32, s1, s0
; GFX11-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 32, addrspace(5)
store i32 0, ptr addrspace(5) %alloca
@@ -349,14 +364,15 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff
+; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
-; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_add_u32 s32, s5, s4
; GFX9-NEXT: s_addk_i32 s32, 0xf000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -376,15 +392,16 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX10-NEXT: s_mov_b32 s33, s6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff
+; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
-; GFX10-NEXT: s_add_u32 s4, s32, s4
+; GFX10-NEXT: s_add_u32 s32, s5, s4
; GFX10-NEXT: s_addk_i32 s32, 0xf800
-; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
@@ -402,16 +419,18 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00
+; GFX11-NEXT: scratch_store_b32 off, v0, s1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, -16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-NEXT: s_add_u32 s32, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s0, s32, s0
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
-; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%n = load i32, ptr addrspace(4) @gv
%alloca = alloca i32, i32 %n, align 32, addrspace(5)
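The GFX10/GFX11 checks above are the same computation under wave32, so every scale uses a shift of 5 instead of 6:

  // scaled alignment = 32 << 5 = 0x400
  // bias             = 0x3ff      -> s_add_u32 s5, s32, 0x3ff
  // mask             = 0xfffffc00 -> s_and_b32 s5, s5, 0xfffffc00
  // scaled size      = perLaneSize << 5 (s_lshl_b32 ..., 5)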
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 01287d5b7cf247..69abef02d3d924 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -27,19 +27,20 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.2: ; %bb.1
-; GCN-NEXT: s_load_dword s5, s[8:9], 0x10
-; GCN-NEXT: s_add_u32 s4, s32, 0x1000
+; GCN-NEXT: s_load_dword s4, s[8:9], 0x10
+; GCN-NEXT: s_mov_b32 s6, s32
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s5, s5, 2
-; GCN-NEXT: s_add_u32 s4, s4, s5
+; GCN-NEXT: s_lshl_b32 s4, s4, 2
+; GCN-NEXT: s_add_u32 s4, s6, s4
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_u32 s32, s6, 0x1000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -94,19 +95,20 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dword s4, s[8:9], 0xc
-; GCN-NEXT: s_add_u32 s5, s32, 0x1000
-; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000
+; GCN-NEXT: s_add_u32 s5, s32, 0xfff
+; GCN-NEXT: s_and_b32 s6, s5, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: v_mov_b32_e32 v3, 1
-; GCN-NEXT: s_add_u32 s4, s5, s4
+; GCN-NEXT: s_add_u32 s4, s6, s4
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_u32 s32, s6, 0x1000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -159,7 +161,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; GCN-NEXT: s_and_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execz .LBB2_3
; GCN-NEXT: ; %bb.2: ; %bb.1
-; GCN-NEXT: s_add_u32 s6, s32, 0x1000
+; GCN-NEXT: s_mov_b32 s6, s32
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
@@ -169,6 +171,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; GCN-NEXT: s_add_u32 s32, s6, 0x1000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off
@@ -219,7 +222,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB3_2
; GCN-NEXT: ; %bb.1: ; %bb.0
-; GCN-NEXT: s_add_u32 s6, s32, 0x1000
+; GCN-NEXT: s_add_u32 s6, s32, 0xfff
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v4, s6
@@ -230,6 +233,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
+; GCN-NEXT: s_add_u32 s32, s6, 0x1000
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
index ed1ca320943de7..5378ce2d1efaad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
@@ -23,8 +23,11 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align1
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -32,8 +35,10 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 1
S_ENDPGM 0, implicit %1
@@ -57,8 +62,11 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align2
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -66,8 +74,10 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 2
S_ENDPGM 0, implicit %1
@@ -91,8 +101,11 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align4
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -100,8 +113,10 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 4
S_ENDPGM 0, implicit %1
@@ -125,8 +140,11 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align8
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -134,8 +152,10 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 8
S_ENDPGM 0, implicit %1
@@ -159,8 +179,11 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align16
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -168,8 +191,10 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 16
S_ENDPGM 0, implicit %1
@@ -193,10 +218,14 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
- ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+ ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align32
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -204,9 +233,12 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024
- ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1023
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024
+ ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 32
@@ -231,10 +263,14 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
- ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+ ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align64
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -242,9 +278,12 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
- ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+ ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 64
@@ -269,10 +308,14 @@ body: |
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -8192
- ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8191
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -8192
+ ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align128
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -280,9 +323,12 @@ body: |
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
- ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
- ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+ ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
%0:_(s32) = COPY $sgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 128
@@ -304,15 +350,20 @@ body: |
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align4
; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
%0:_(s32) = G_CONSTANT i32 32
%1:_(p5) = G_DYN_STACKALLOC %0, 4
S_ENDPGM 0, implicit %1
@@ -336,8 +387,11 @@ body: |
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align8
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -345,8 +399,10 @@ body: |
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
%0:_(s32) = G_CONSTANT i32 32
%1:_(p5) = G_DYN_STACKALLOC %0, 8
S_ENDPGM 0, implicit %1
@@ -370,8 +426,11 @@ body: |
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align16
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -379,8 +438,10 @@ body: |
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5)
%0:_(s32) = G_CONSTANT i32 32
%1:_(p5) = G_DYN_STACKALLOC %0, 16
S_ENDPGM 0, implicit %1
@@ -404,10 +465,14 @@ body: |
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
- ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+ ; WAVE64-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+ ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C3]](s32)
+ ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ ;
; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align32
; WAVE32: liveins: $sgpr0
; WAVE32-NEXT: {{ $}}
@@ -415,9 +480,12 @@ body: |
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32)
; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
- ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32)
- ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024
- ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1023
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+ ; WAVE32-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024
+ ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C3]](s32)
+ ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
%0:_(s32) = G_CONSTANT i32 32
%1:_(p5) = G_DYN_STACKALLOC %0, 32
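For readers cross-checking the new check lines above: only the align-32 case grows an add/mask pair, since the 8- and 16-byte cases do not exceed the default stack alignment. The constants follow from scaling the per-lane alignment by the wavefront size: wave64 scales 32 by 1 << 6 (add 2047, mask -2048), wave32 by 1 << 5 (add 1023, mask -1024). A minimal plain-C++ sketch of that align-up, with a hypothetical helper name that is not part of the patch:

  #include <cstdint>

  // Round a wave-scaled stack pointer up to a wave-scaled alignment.
  uint32_t alignUpScratch(uint32_t SP, uint32_t PerLaneAlign,
                          unsigned WaveSizeLog2) {
    uint32_t Scaled = PerLaneAlign << WaveSizeLog2; // e.g. 32 << 6 = 2048
    return (SP + (Scaled - 1)) & -Scaled;           // add 2047, AND -2048
  }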
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 1a0fda3d54d3f4..a5f915c48ebeea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -143,7 +143,7 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
-; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}}
+; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
@@ -157,10 +157,10 @@ attributes #0 = { nounwind }
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
; SDAG-NEXT: .sgpr_count: 0x25{{$}}
-; GISEL-NEXT: .sgpr_count: 0x27{{$}}
+; GISEL-NEXT: .sgpr_count: 0x26{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
; SDAG-NEXT: .vgpr_count: 0x3{{$}}
-; GISEL-NEXT: .vgpr_count: 0x5{{$}}
+; GISEL-NEXT: .vgpr_count: 0x4{{$}}
; GCN-NEXT: multiple_stack:
; GCN-NEXT: .backend_stack_size: 0x24{{$}}
; GCN-NEXT: .lds_size: 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 0477d55e9baa36..4a1956e309a515 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -130,14 +130,16 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_mov_b32 s4, s32
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff
+; MUBUF-NEXT: s_and_b32 s4, s4, 0x1000
; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
-; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0
+; MUBUF-NEXT: v_mov_b32_e32 v2, s4
+; MUBUF-NEXT: v_mov_b32_e32 v3, 1
; MUBUF-NEXT: s_add_i32 s4, s4, s5
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -162,8 +164,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
+; FLATSCR-NEXT: s_add_i32 s0, s32, 0xfff
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_mov_b32 s0, s32
+; FLATSCR-NEXT: s_and_b32 s0, s0, 0x1000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
@@ -319,11 +322,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_mov_b32 s6, s32
+; MUBUF-NEXT: s_add_i32 s6, s32, 0xfff
+; MUBUF-NEXT: s_and_b32 s6, s6, 0x1000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
+; MUBUF-NEXT: v_mov_b32_e32 v4, s6
+; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
+; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
@@ -351,7 +356,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_mov_b32 s2, s32
+; FLATSCR-NEXT: s_add_i32 s2, s32, 0xfff
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0x1000
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
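A side note on the masks in this intermediate state: s_and_b32 sN, sN, 0x1000 ANDs with the scaled alignment value itself, which keeps only bit 12 of the address; the correct mask is the alignment's negation, and the follow-up patch below switches these checks to 0xfffff000. A standalone sketch with made-up example values (nothing here is taken from the patch) illustrating the difference:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t SP   = 0x1040;            // hypothetical wave-scaled SP
    uint32_t Tmp  = SP + 0xfff;        // s_add_i32 s4, s32, 0xfff
    uint32_t Bad  = Tmp & 0x1000;      // 0x0: address bits discarded
    uint32_t Good = Tmp & 0xfffff000;  // 0x2000: SP rounded up to 0x1000
    std::printf("bad=0x%x good=0x%x\n", Bad, Good);
  }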
>From 55f293d519c73363828b43307a76a627d2a670da Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 19 Dec 2024 12:05:39 +0530
Subject: [PATCH 3/3] Review comments pt.2
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +++++++-------
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 8 ++++----
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0eba71077db6bf..679c264ff384cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4048,20 +4048,20 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
if (Alignment > StackAlign) {
- auto ScaledAlignment = (uint64_t)Alignment.value()
- << Subtarget->getWavefrontSizeLog2();
- auto StackAlignMask = ScaledAlignment - 1;
- auto TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
- DAG.getConstant(StackAlignMask, dl, VT));
+ uint64_t ScaledAlignment = (uint64_t)Alignment.value()
+ << Subtarget->getWavefrontSizeLog2();
+ uint64_t StackAlignMask = ScaledAlignment - 1;
+ SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
+ DAG.getConstant(StackAlignMask, dl, VT));
BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
- DAG.getConstant(ScaledAlignment, dl, VT));
+ DAG.getSignedConstant(-ScaledAlignment, dl, VT));
}
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
- auto NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+ SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
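Taken together, the hunk above computes, in DAG form, the following plain-integer logic for AMDGPU's upward-growing stack (a hedged scalar sketch; the struct and function names are illustrative, not LLVM API):

  #include <cstdint>

  struct AllocResult {
    uint32_t Ptr;   // base address returned to the program
    uint32_t NewSP; // value copied back into the stack pointer
  };

  AllocResult dynAllocaGrowUp(uint32_t SP, uint32_t Size, uint32_t Align,
                              unsigned WaveLog2, uint32_t StackAlign) {
    uint32_t Base = SP;                          // BaseAddr = old SP
    if (Align > StackAlign) {                    // realign only if over-aligned
      uint32_t Scaled = Align << WaveLog2;       // scale per-lane alignment
      Base = (Base + (Scaled - 1)) & -Scaled;    // ADD mask, AND -Scaled
    }
    uint32_t NewSP = Base + (Size << WaveLog2);  // bump SP past the object
    return {Base, NewSP};                        // object lives at Base
  }

Unlike the default downward-growing lowering, where the post-decrement SP doubles as the allocation's address, here the returned pointer is the pre-increment (realigned) SP, not the new SP.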
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 4a1956e309a515..2bd60e869f843a 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -131,7 +131,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff
-; MUBUF-NEXT: s_and_b32 s4, s4, 0x1000
+; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
@@ -166,7 +166,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s0, s32, 0xfff
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_and_b32 s0, s0, 0x1000
+; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
@@ -323,7 +323,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s6, s32, 0xfff
-; MUBUF-NEXT: s_and_b32 s6, s6, 0x1000
+; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, s6
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
@@ -357,7 +357,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s2, s32, 0xfff
-; FLATSCR-NEXT: s_and_b32 s2, s2, 0x1000
+; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2