[llvm] [AMDGPU] Optimizing Dynamic Alloca S-DAG I-Sel (PR #124292)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 07:37:09 PST 2025
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/124292
To get the allocation size for dynamic allocas,
first we scale the `max_size` by the `wave_size`,
and then bump SP by this amount.
```
base_addr = sp
dyn_size = max_size << log2(wave_size)
sp = base_addr + dyn_size
```
Currently, during `DAG->DAG ISel`, the scalar
left-shift and addition operations are being
folded into a `v_lshl_add_u32` instruction,
and then `readfirstlane` is used to move the
value into SP.
This patch enforces the use of scalar operations,
removing the need for a `readfirstlane` instruction.
This makes the code generation for SelectionDAG and
GlobalISel uniform.
>From 6411df966a4308b103133e0dd5d23cbbaf1fd7a9 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Fri, 24 Jan 2025 20:49:14 +0530
Subject: [PATCH] [AMDGPU] Optimizing Dynamic Alloca S-DAG I-Sel To get the
allocation size for dynamic allocas, first we scale the `max_size` by the
`wave_size`, and then bump SP by this amount. ``` addr = sp size = max_size
<< log2(wave_size) sp = addr + size ``` Currently, during `DAG->DAG ISel`, the
scalar left-shift and addition operations are being folded into a
`v_lshl_add_u32` instruction and then `readfirstlane` is used to copy the
value into SP.
This patch enforces the use of scalar operations,
removing the need for a `readfirstlane` instr.
This makes the code-gen for S-DAG and G-ISEL
uniform.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +-
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 373 +++++++++---------
2 files changed, 188 insertions(+), 198 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1aeca7f370aa1b..6427e4e9f6a2ec 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4088,15 +4088,12 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
Size, DAG.getConstant(0, dl, MVT::i32));
- SDValue ScaledSize = DAG.getNode(
- ISD::SHL, dl, VT, Size,
+ SDNode *ScaledSize = DAG.getMachineNode(
+ AMDGPU::S_LSHL_B32, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
- NewSP =
- DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
- SDValue ReadFirstLaneID =
- DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
- NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
- NewSP);
+ NewSP = {DAG.getMachineNode(AMDGPU::S_ADD_I32, dl, VT, BaseAddr,
+ SDValue(ScaledSize, 0)),
+ 0};
}
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0b..edcf4c29102c47 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -269,12 +269,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -325,12 +324,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
@@ -386,13 +386,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -446,11 +446,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -507,12 +507,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -561,12 +560,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
@@ -609,7 +609,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s9, 0
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
@@ -621,29 +621,28 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0xfffff000
; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s8, s5
; GFX9-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9-SDAG-NEXT: s_ff1_i32_b64 s5, s[6:7]
; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s5
; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s5
-; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_max_u32 s9, s9, s10
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s9, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 4
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s5
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
@@ -749,19 +748,18 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-SDAG-NEXT: s_lshl_b32 s2, s2, 5
; GFX11-SDAG-NEXT: s_mov_b32 s3, s32
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s3 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s3, s2
; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
@@ -866,11 +864,10 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s4, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s4
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
@@ -967,12 +964,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s2
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
@@ -1077,12 +1075,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1137,12 +1134,13 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1204,13 +1202,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
; GFX9-SDAG-NEXT: s_mov_b32 s34, s10
@@ -1274,13 +1272,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX11-SDAG-NEXT: ; %bb.2:
; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
; GFX11-SDAG-NEXT: s_mov_b32 s34, s5
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1342,12 +1340,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1402,12 +1399,13 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1467,12 +1465,11 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1530,12 +1527,13 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1603,11 +1601,11 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s7, 6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
; GFX9-SDAG-NEXT: s_mov_b32 s34, s11
@@ -1675,14 +1673,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1747,12 +1744,11 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1810,12 +1806,13 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1876,22 +1873,21 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s9, 0
; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
-; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
-; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s10
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10
+; GFX9-SDAG-NEXT: s_max_u32 s9, s9, s11
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: s_add_i32 s7, s32, 0xfff
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s9, 6
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 0xfffff000
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s6
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
@@ -1903,16 +1899,15 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
; GFX9-SDAG-NEXT: ; %bb.5:
-; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 3
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s10, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s32
; GFX9-SDAG-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 4
-; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s7
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1928,14 +1923,13 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
; GFX9-SDAG-NEXT: ; %bb.8:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s8, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
; GFX9-SDAG-NEXT: s_mov_b32 s34, s14
@@ -2039,30 +2033,29 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s2, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
-; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX11-SDAG-NEXT: ; %bb.3:
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
-; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
-; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_lshl_b32 s4, s2, 5
+; GFX11-SDAG-NEXT: s_and_b32 s2, s3, 0xfffff800
; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
; GFX11-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4
@@ -2074,14 +2067,14 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
; GFX11-SDAG-NEXT: ; %bb.5:
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT: s_lshl_b32 s3, s3, 5
; GFX11-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s3, 5, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_add_i32 s32, s4, s3
; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15
@@ -2099,14 +2092,14 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
; GFX11-SDAG-NEXT: ; %bb.8:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
; GFX11-SDAG-NEXT: s_mov_b32 s34, s8
; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
@@ -2242,11 +2235,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT: s_add_i32 s7, s32, 0xfff
+; GFX9-SDAG-NEXT: s_and_b32 s7, s7, 0xfffff000
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31
@@ -2267,12 +2260,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
; GFX9-SDAG-NEXT: ; %bb.7:
-; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s7
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2380,13 +2372,13 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX11-SDAG-NEXT: ; %bb.3:
; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s1
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow
; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8
@@ -2407,12 +2399,13 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
; GFX11-SDAG-NEXT: ; %bb.7:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
@@ -2527,12 +2520,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -2590,12 +2582,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2656,12 +2649,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -2716,12 +2708,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits mailing list