[llvm] [WIP] Optimizing dynamic alloca ISEL (PR #123746)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 04:51:51 PST 2025
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/123746
(No description provided.)
>From e5aad111a9e43c85ade4a60b468ccac56a40dc5b Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 21 Jan 2025 18:20:18 +0530
Subject: [PATCH] Optimizing dynamic alloca ISEL
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +-
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 377 +++++++++---------
2 files changed, 183 insertions(+), 207 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6cf5774fc53b06..34f46c89484105 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4081,15 +4081,12 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
Size, DAG.getConstant(0, dl, MVT::i32));
- SDValue ScaledSize = DAG.getNode(
- ISD::SHL, dl, VT, Size,
+ SDNode *ScaledSize = DAG.getMachineNode(
+ AMDGPU::S_LSHL_B32, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
- NewSP =
- DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
- SDValue ReadFirstLaneID =
- DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
- NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
- NewSP);
+ NewSP = {DAG.getMachineNode(AMDGPU::S_ADD_I32, dl, VT, BaseAddr,
+ SDValue(ScaledSize, 0)),
+ 0};
}
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9acb3a42ae102c..0b11a6db23b05a 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -269,12 +269,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -325,12 +324,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
@@ -386,13 +386,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -446,11 +446,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -507,12 +507,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -561,12 +560,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
@@ -609,7 +609,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s9, 0
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
@@ -621,29 +621,28 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
-; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0xfffff000
; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s8, s5
; GFX9-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9-SDAG-NEXT: s_ff1_i32_b64 s5, s[6:7]
; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s5
; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s5
-; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_max_u32 s9, s9, s10
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s9, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 4
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s5
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
@@ -749,19 +748,18 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-SDAG-NEXT: s_lshl_b32 s2, s2, 5
; GFX11-SDAG-NEXT: s_mov_b32 s3, s32
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s3 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s3, s2
; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
@@ -866,11 +864,10 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s4, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s4
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
@@ -967,12 +964,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s2
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
@@ -1077,12 +1075,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1137,14 +1134,13 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1202,13 +1198,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
-; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1267,14 +1263,13 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
; GFX11-SDAG-NEXT: ; %bb.2:
; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1333,12 +1328,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1393,14 +1387,13 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1458,12 +1451,11 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1521,14 +1513,13 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1592,11 +1583,11 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
-; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s7, 6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s6, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
; GFX9-SDAG-NEXT: s_mov_b32 s33, s10
@@ -1658,14 +1649,14 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
@@ -1727,12 +1718,11 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -1790,14 +1780,13 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1854,22 +1843,21 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s9, 0
; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
-; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9
-; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
-; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s10
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10
+; GFX9-SDAG-NEXT: s_max_u32 s9, s9, s11
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: s_add_i32 s7, s32, 0xfff
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s9, 6
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 0xfffff000
; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s6
; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
@@ -1881,16 +1869,15 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
; GFX9-SDAG-NEXT: ; %bb.5:
-; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 3
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s10, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s32
; GFX9-SDAG-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 4
-; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s7
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1906,14 +1893,13 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
; GFX9-SDAG-NEXT: ; %bb.8:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s8, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000
; GFX9-SDAG-NEXT: s_mov_b32 s33, s13
@@ -2011,30 +1997,29 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
-; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s2, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
-; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
; GFX11-SDAG-NEXT: ; %bb.3:
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
-; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
-; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_lshl_b32 s4, s2, 5
+; GFX11-SDAG-NEXT: s_and_b32 s2, s3, 0xfffff800
; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
; GFX11-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4
@@ -2046,14 +2031,14 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
; GFX11-SDAG-NEXT: ; %bb.5:
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT: s_lshl_b32 s3, s3, 5
; GFX11-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s3, 5, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_add_i32 s32, s4, s3
; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15
@@ -2071,16 +2056,15 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
; GFX11-SDAG-NEXT: ; %bb.8:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2209,11 +2193,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX9-SDAG-NEXT: ; %bb.3:
-; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
-; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT: s_add_i32 s7, s32, 0xfff
+; GFX9-SDAG-NEXT: s_and_b32 s7, s7, 0xfffff000
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31
@@ -2234,12 +2218,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
; GFX9-SDAG-NEXT: ; %bb.7:
-; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s32
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_add_i32 s32, s7, s6
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s7
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2341,13 +2324,13 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
; GFX11-SDAG-NEXT: ; %bb.3:
; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s1
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow
; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8
@@ -2368,15 +2351,15 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
; GFX11-SDAG-NEXT: ; %bb.7:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5
; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80
; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2485,12 +2468,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -2548,14 +2530,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2614,12 +2595,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-SDAG-NEXT: ; %bb.2:
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
-; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
@@ -2674,14 +2654,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
-; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
More information about the llvm-commits mailing list