[llvm] [AMDGPU] Update base addr of dyn alloca considering GrowingUp stack (PR #119822)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 12 22:34:00 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
Changes:
Currently, the compiler computes the base address of a
dynamically sized stack object (alloca) by bumping the SP
by the required size and then aligning the bumped SP
(a sketch follows the list):
1. `NewSP = Align(CurrSP + Size)`
where `Size = number of elements * element type size * wave size`
2. `BaseAddr = NewSP`
3. The alignment rounds down: `AlignedAddr = Addr & ~(Alignment - 1)`
4. Return `BaseAddr`, which is correct when the stack grows downwards.
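
As a rough C++ sketch of that bump-then-align scheme (an
illustration with hypothetical helper names, not the actual
lowering; `Alignment` is assumed to be a power of two):

```cpp
#include <cstdint>

// Bump-then-align: correct when the stack grows downwards.
// Rounding the bumped SP *down* keeps the object inside the
// newly reserved region, and the new SP is the base address.
uint64_t allocaStackGrowsDown(uint64_t &SP, uint64_t Size,
                              uint64_t Alignment) {
  uint64_t NewSP = (SP - Size) & ~(Alignment - 1); // bump, then round down
  SP = NewSP;
  return SP; // BaseAddr == NewSP
}
```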
For AMDGPU, which uses an upward-growing stack,
the base address has to be aligned first and the SP
bumped by the required size afterwards (sketched below):
1. `BaseAddr = Align(CurrSP)`
2. `NewSP = BaseAddr + Size`
3. The alignment rounds up: `AlignedAddr = (Addr + (Alignment - 1)) & ~(Alignment - 1)`
4. Return `BaseAddr`.
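
And the upward-growing counterpart this patch implements,
again as an illustrative sketch under the same assumptions:

```cpp
#include <cstdint>

// Align-then-bump: correct when the stack grows upwards (AMDGPU).
// The *current* SP is rounded up to the alignment to form the base
// address, and the SP is bumped past the object afterwards.
uint64_t allocaStackGrowsUp(uint64_t &SP, uint64_t Size,
                            uint64_t Alignment) {
  uint64_t BaseAddr = (SP + Alignment - 1) & ~(Alignment - 1); // round up
  SP = BaseAddr + Size;                                        // bump last
  return BaseAddr; // the old (aligned) SP, not the new one
}
```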
---
Full diff: https://github.com/llvm/llvm-project/pull/119822.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+17-14)
- (modified) llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll (+28-36)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc8bbb154d035d..bee66f0f7fa2db 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4002,8 +4002,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}
-// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for applying the wave size scale to the increment amount.
+// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for stack growth direction(default: downwards, AMDGPU: upwards) and
+// applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
@@ -4023,10 +4024,20 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
+ SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SPOld.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
+ Align StackAlign = TFL->getStackAlign();
+ if (Alignment && *Alignment > StackAlign) {
+ SDValue ScaledAlignment = DAG.getSignedConstant(
+ (uint64_t)Alignment->value() << Subtarget->getWavefrontSizeLog2(), dl,
+ VT);
+ SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment,
+ DAG.getConstant(1, dl, VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask);
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, ScaledAlignment);
+ }
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
? ISD::ADD
@@ -4036,20 +4047,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
- Align StackAlign = TFL->getStackAlign();
- Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
- if (Alignment && *Alignment > StackAlign) {
- Tmp1 = DAG.getNode(
- ISD::AND, dl, VT, Tmp1,
- DAG.getSignedConstant(-(uint64_t)Alignment->value()
- << Subtarget->getWavefrontSizeLog2(),
- dl, VT));
- }
+ Tmp1 = DAG.getNode(Opc, dl, VT, SPOld, ScaledSize); // Value
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
- return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+ return DAG.getMergeValues({SPOld, Tmp2}, dl);
}
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 85096eb63f46e1..0477d55e9baa36 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s6
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: s_add_i32 s6, s6, s7
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
-; MUBUF-NEXT: s_mov_b32 s32, s4
+; MUBUF-NEXT: s_mov_b32 s4, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s4
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
+; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
; MUBUF-NEXT: s_add_i32 s4, s4, s5
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s0
+; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v3, s6
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v4, s6
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
``````````
https://github.com/llvm/llvm-project/pull/119822
More information about the llvm-commits mailing list