[llvm] [AMDGPU] Optimizing Dynamic Alloca S-DAG I-Sel (PR #124292)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 24 07:37:45 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Aaditya (easyonaadit)

<details>
<summary>Changes</summary>

To get the allocation size for dynamic allocas,
first we scale the `max_size` by the `wave_size`,
and then bump SP by this amount.
```
base_addr = sp
dyn_size = max_size << log2(wave_size)
sp = base_addr + dyn_size
```
Currently, during `DAG->DAG ISel`, the scalar
left-shift and addition operations are
folded into a single vector `v_lshl_add_u32` instruction,
and then `readfirstlane` is used to move the
result back into SP.

This patch enforces the use of scalar operations,
removing the need for a `readfirstlane` instruction.
This makes the code-gen for S-DAG and G-ISEL
consistent.

---

Patch is 36.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124292.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-8) 
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+183-190) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1aeca7f370aa1b..6427e4e9f6a2ec 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4088,15 +4088,12 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
         DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
     Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
                        Size, DAG.getConstant(0, dl, MVT::i32));
-    SDValue ScaledSize = DAG.getNode(
-        ISD::SHL, dl, VT, Size,
+    SDNode *ScaledSize = DAG.getMachineNode(
+        AMDGPU::S_LSHL_B32, dl, VT, Size,
         DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
-    NewSP =
-        DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
-    SDValue ReadFirstLaneID =
-        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
-    NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
-                        NewSP);
+    NewSP = {DAG.getMachineNode(AMDGPU::S_ADD_I32, dl, VT, BaseAddr,
+                                SDValue(ScaledSize, 0)),
+             0};
   }
 
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0b..edcf4c29102c47 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -269,12 +269,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -325,12 +324,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
@@ -386,13 +386,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -446,11 +446,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s0, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -507,12 +507,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -561,12 +560,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
@@ -609,7 +609,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
 ; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
@@ -621,29 +621,28 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
 ; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
 ; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s8, s6, 0xfffff000
 ; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT:    s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s8, s5
 ; GFX9-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s5, s[6:7]
 ; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s5
 ; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s5
-; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_max_u32 s9, s9, s10
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX9-SDAG-NEXT:  ; %bb.3:
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 3
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s8
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s9, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 4
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s6, s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:  .LBB6_4: ; %bb.1
 ; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
@@ -749,19 +748,18 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
 ; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-SDAG-NEXT:    s_lshl_b32 s2, s2, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s3 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s3, s2
 ; GFX11-SDAG-NEXT:  .LBB6_4: ; %bb.1
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
 ; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33 dlc
@@ -866,11 +864,10 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
 ; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s6, s4
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
@@ -967,12 +964,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
 ; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s2
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s2, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
 ; GFX11-SDAG-NEXT:  .LBB7_4: ; %bb.0
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s1, 2
@@ -1077,12 +1075,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
@@ -1137,12 +1134,13 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1204,13 +1202,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s10
@@ -1274,13 +1272,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
 ; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
 ; GFX11-SDAG-NEXT:    s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1342,12 +1340,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
@@ -1402,12 +1399,13 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 22
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1467,12 +1465,11 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
@@ -1530,12 +1527,13 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s33
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1603,11 +1601,11 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX9-SDAG-NEXT:  ; %bb.2:
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s7, 6
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s6, s4
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, s34
 ; GFX9-SDAG-NEXT:    s_mov_b32 s34, s11
@@ -1675,14 +1673,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX11-SDAG-NEXT:  ; %bb.2:
-; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 5
 ; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
-; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s0, s1
 ; GFX11-SDAG-NEXT:    s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/124292


More information about the llvm-commits mailing list