[llvm] [AMDGPU] Refactored code for handling dynamic allocas with growing up stack (PR #119168)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 9 00:25:43 PST 2024
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/119168
>From 19e4f79a094b1a7ce44970a506287d2137205801 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sat, 7 Dec 2024 16:27:59 +0530
Subject: [PATCH 1/2] changes to old code to refactor them, alignment, zero
sized allocas, returning correct start location
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +++++++----
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 64 +++++++++-----------
2 files changed, 52 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc8bbb154d035d..92f8a8f8099967 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4023,10 +4023,25 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
+ // Start address of the dynamically sized stack object
+ SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SPOld.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
+ // First we need to align the start address of the stack object to the required alignment.
+ Align StackAlign = TFL->getStackAlign();
+ if (Alignment && *Alignment > StackAlign) {
+ // formula for aligning address `SPold` to alignment boundary `align` => alignedSP = (SPold + (align - 1)) & ~(align - 1)
+ SDValue AlignedValue = DAG.getConstant(Alignment->value(), dl, VT); // the alignment boundry we want to align to
+ SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, AlignedValue, // StackAlignMask = (align - 1)
+ DAG.getConstant(1, dl, VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask); // Tmp1 = (SPold + (align - 1))
+ Tmp1 = DAG.getNode( // Tmp1 now holds the start address aligned to the required value
+ ISD::AND, dl, VT, Tmp1,
+ DAG.getSignedConstant(-(uint64_t)Alignment->value()
+ << Subtarget->getWavefrontSizeLog2(),
+ dl, VT));
+ }
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
? ISD::ADD
@@ -4035,19 +4050,17 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ // In case the value in %n at runtime is 0, we need to handle that case. There should not be a 0-sized stack object.
+ ScaledSize = DAG.getNode( // size = max(size, 0)
+ ISD::UMAX, dl, VT, ScaledSize,
+ DAG.getConstant(1, dl, VT));
- Align StackAlign = TFL->getStackAlign();
- Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
- if (Alignment && *Alignment > StackAlign) {
- Tmp1 = DAG.getNode(
- ISD::AND, dl, VT, Tmp1,
- DAG.getSignedConstant(-(uint64_t)Alignment->value()
- << Subtarget->getWavefrontSizeLog2(),
- dl, VT));
- }
+ Tmp1 = DAG.getNode(Opc, dl, VT, SPOld, ScaledSize); // Value
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+ // Set Tmp1 to point to the start address of this stack object.
+ Tmp1 = SPOld;
return DAG.getMergeValues({Tmp1, Tmp2}, dl);
}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 85096eb63f46e1..0477d55e9baa36 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s6
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: s_add_i32 s6, s6, s7
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
-; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
-; MUBUF-NEXT: s_mov_b32 s32, s4
+; MUBUF-NEXT: s_mov_b32 s4, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s4
-; MUBUF-NEXT: v_mov_b32_e32 v3, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, 1
+; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
+; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
; MUBUF-NEXT: s_add_i32 s4, s4, s5
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
-; FLATSCR-NEXT: s_mov_b32 s32, s0
+; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v3, s6
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
-; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
+; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
-; MUBUF-NEXT: v_mov_b32_e32 v4, s6
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
-; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; MUBUF-NEXT: s_mov_b32 s32, s6
+; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
-; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
-; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
+; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
-; FLATSCR-NEXT: s_mov_b32 s32, s2
+; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
>From 969399068837617e642ff954f7b7c7900ba5e5d9 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 9 Dec 2024 13:54:33 +0530
Subject: [PATCH 2/2] updated code
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 92f8a8f8099967..8e490c40e5a175 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4002,8 +4002,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}
-// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for applying the wave size scale to the increment amount.
+// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for considering a growing up stack and applying the wave size scale
+// to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
@@ -4032,15 +4033,15 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Align StackAlign = TFL->getStackAlign();
if (Alignment && *Alignment > StackAlign) {
// formula for aligning address `SPold` to alignment boundary `align` => alignedSP = (SPold + (align - 1)) & ~(align - 1)
- SDValue AlignedValue = DAG.getConstant(Alignment->value(), dl, VT); // the alignment boundry we want to align to
- SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, AlignedValue, // StackAlignMask = (align - 1)
+ SDValue ScaledAlignment = DAG.getSignedConstant((uint64_t)Alignment->value()
+ << Subtarget->getWavefrontSizeLog2(),
+ dl, VT);
+ // SDValue AlignedValue = DAG.getConstant(Alignment->value(), dl, VT); // the alignment boundry we want to align to
+ SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment, // StackAlignMask = (align - 1)
DAG.getConstant(1, dl, VT));
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask); // Tmp1 = (SPold + (align - 1))
Tmp1 = DAG.getNode( // Tmp1 now holds the start address aligned to the required value
- ISD::AND, dl, VT, Tmp1,
- DAG.getSignedConstant(-(uint64_t)Alignment->value()
- << Subtarget->getWavefrontSizeLog2(),
- dl, VT));
+ ISD::AND, dl, VT, Tmp1, ScaledAlignment);
}
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp
@@ -4051,7 +4052,7 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
// In case the value in %n at runtime is 0, we need to handle that case. There should not be a 0-sized stack object.
- ScaledSize = DAG.getNode( // size = max(size, 0)
+ ScaledSize = DAG.getNode( // size = max(size, 1)
ISD::UMAX, dl, VT, ScaledSize,
DAG.getConstant(1, dl, VT));
@@ -4059,10 +4060,8 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
- // Set Tmp1 to point to the start address of this stack object.
- Tmp1 = SPOld;
- return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+ return DAG.getMergeValues({SPOld, Tmp2}, dl); // return start address of the stack object
}
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
More information about the llvm-commits
mailing list