[llvm] [MachineScheduler] Fix physreg dependencies of ExitSU (PR #123541)
Sergei Barannikov via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 00:21:23 PST 2025
https://github.com/s-barannikov updated https://github.com/llvm/llvm-project/pull/123541
>From 05cca1be4c7ea4279f028f4b90805db0ef46b0cf Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Mon, 20 Jan 2025 05:40:22 +0300
Subject: [PATCH 1/5] [MachineScheduler] Fix physreg dependencies of ExitSU
Providing the correct operand index allows addPhysRegDataDeps
to compute the correct latency.
---
llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 8 +-
.../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 10 +-
.../GlobalISel/call-outgoing-stack-args.ll | 14 +-
...vergence-divergent-i1-used-outside-loop.ll | 2 +-
.../AMDGPU/GlobalISel/regbankselect-mui.ll | 4 +-
.../abi-attribute-hints-undefined-behavior.ll | 26 +-
.../atomic_optimizations_global_pointer.ll | 18 +-
.../atomic_optimizations_local_pointer.ll | 78 +-
llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 126 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 1347 ++++++++---------
.../CodeGen/AMDGPU/call-reqd-group-size.ll | 8 +-
llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 28 +-
llvm/test/CodeGen/AMDGPU/cc-update.ll | 112 +-
.../AMDGPU/cross-block-use-is-not-abi-copy.ll | 24 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 4 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 4 +-
...frame-setup-without-sgpr-to-vgpr-spills.ll | 10 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 82 +-
.../AMDGPU/global_atomics_scan_fmax.ll | 22 +-
.../AMDGPU/global_atomics_scan_fmin.ll | 22 +-
.../AMDGPU/global_atomics_scan_fsub.ll | 82 +-
...ne-sink-temporal-divergence-swdev407790.ll | 126 +-
.../AMDGPU/set-inactive-wwm-overwrite.ll | 4 +-
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 +-
...tack-pointer-offset-relative-frameindex.ll | 6 +-
llvm/test/CodeGen/AMDGPU/swdev373493.ll | 12 +-
.../AMDGPU/tail-call-inreg-arguments.ll | 2 +-
.../AMDGPU/tuple-allocation-failure.ll | 24 +-
llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll | 6 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 2 +-
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 12 +-
.../CodeGen/ARM/arm-shrink-wrapping-linux.ll | 4 +-
llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll | 4 +-
llvm/test/CodeGen/ARM/vector-DAGCombine.ll | 2 +-
llvm/test/CodeGen/PowerPC/p10-spill-creq.ll | 8 +-
.../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 2 +-
.../Thumb2/LowOverheadLoops/minloop.ll | 2 +-
.../LowOverheadLoops/mve-float-loops.ll | 6 +-
.../LowOverheadLoops/mve-tail-data-types.ll | 6 +-
.../Thumb2/LowOverheadLoops/reductions.ll | 2 +-
.../CodeGen/Thumb2/mve-float32regloops.ll | 8 +-
.../CodeGen/Thumb2/mve-gather-increment.ll | 2 +-
.../Thumb2/mve-gather-scatter-optimisation.ll | 2 +-
.../Thumb2/mve-laneinterleaving-reduct.ll | 2 +-
llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll | 18 +-
.../CodeGen/Thumb2/mve-postinc-distribute.ll | 2 +-
llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll | 8 +-
.../test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll | 2 +-
.../Thumb2/mve-tailpred-nonzerostart.ll | 2 +-
.../test/CodeGen/Thumb2/pacbti-m-varargs-2.ll | 2 +-
llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll | 2 +-
llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll | 2 +-
53 files changed, 1145 insertions(+), 1174 deletions(-)
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 8e3e06bf57153e..b2c3a0109b3a91 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -209,13 +209,17 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
ExitSU.setInstr(ExitMI);
// Add dependencies on the defs and uses of the instruction.
if (ExitMI) {
+ const MCInstrDesc &MIDesc = ExitMI->getDesc();
for (const MachineOperand &MO : ExitMI->all_uses()) {
+ unsigned OpIdx = MO.getOperandNo();
Register Reg = MO.getReg();
if (Reg.isPhysical()) {
+ bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
+ MIDesc.hasImplicitUseOfPhysReg(Reg);
for (MCRegUnit Unit : TRI->regunits(Reg))
- Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
+ Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ? OpIdx : -1, Unit));
} else if (Reg.isVirtual() && MO.readsReg()) {
- addVRegUseDeps(&ExitSU, MO.getOperandNo());
+ addVRegUseDeps(&ExitSU, OpIdx);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 604caf572b0fe8..830e932570b82b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -15,12 +15,12 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -45,11 +45,11 @@ define ptr addrspace(1) @tail_call_assert_align() {
; CHECK-LABEL: tail_call_assert_align:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[16:17]
entry:
%call = tail call align 4 ptr addrspace(1) @ext(ptr addrspace(1) null)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 974ce492daea8b..686e7db6facd4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -44,8 +44,8 @@ define amdgpu_kernel void @kernel_caller_stack() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
-; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
+; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
@@ -239,11 +239,11 @@ define void @func_caller_stack() {
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
@@ -274,15 +274,15 @@ define void @func_caller_stack() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
-; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
@@ -312,10 +312,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
@@ -394,8 +394,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
-; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 935200d5953072..91e16d91ddd15a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index 287a8ab0e52f52..f1aaf1d59a5dc8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -380,8 +380,8 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
; OLD_RBS: ; %bb.0: ; %A
-; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_mov_b32 s0, 1
@@ -393,8 +393,8 @@ define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1
;
; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
; NEW_RBS: ; %bb.0: ; %A
-; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_mov_b32 s0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..8083e8e717f6be 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -25,10 +25,10 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2
; FIXEDABI-NEXT: s_addk_i32 s32, 0x400
; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0
-; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_getpc_b64 s[16:17]
; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
+; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
@@ -49,21 +49,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8
+; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
+; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
+; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2
; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6
; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7
; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5]
-; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
-; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FIXEDABI-SDAG-NEXT: s_endpgm
;
@@ -71,21 +71,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9
+; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8
; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1
+; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
+; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
+; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0
; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6
; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7
; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0
-; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5]
-; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4
-; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12
; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
; FIXEDABI-GISEL-NEXT: s_endpgm
call void @requires_all_inputs()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9577230c6c52e2..d9ec61b709236f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1291,9 +1291,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1417,9 +1417,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1545,9 +1546,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -3140,8 +3141,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -4846,9 +4847,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s4, s6
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -4972,9 +4973,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5100,9 +5102,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -6734,8 +6736,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 45b161d7959f4f..6d6fb6a7db4c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -919,9 +919,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1030,9 +1030,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -2630,8 +2630,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -2812,8 +2812,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -3301,8 +3301,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
@@ -4341,9 +4341,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -4452,9 +4452,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -6079,8 +6079,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6261,8 +6261,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6761,9 +6761,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -6872,9 +6872,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -7468,8 +7469,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -7625,8 +7626,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -8126,9 +8127,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -8237,9 +8238,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -8832,8 +8833,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -8989,8 +8990,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -9490,9 +9491,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -9601,9 +9602,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -10196,8 +10197,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -10353,8 +10354,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -10853,9 +10854,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -10964,9 +10965,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -11971,8 +11973,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12185,8 +12187,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12686,9 +12688,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -12797,9 +12799,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -13804,8 +13807,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2
@@ -14018,8 +14021,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2
@@ -14520,9 +14523,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -14631,9 +14634,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -15629,8 +15632,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -15837,8 +15840,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -16338,9 +16341,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -16449,9 +16452,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s0, s2
; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -17446,8 +17450,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
@@ -17654,8 +17658,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX1132_DPP-NEXT: s_mov_b32 s2, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index 93a4469c7718ea..1bb1209b6118bf 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -43,11 +43,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -103,11 +103,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -163,11 +163,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -223,12 +223,12 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -284,12 +284,12 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -345,13 +345,13 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s19, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -407,14 +407,14 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s20, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -470,6 +470,9 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s24, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[24:25]
+; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
@@ -479,9 +482,6 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
; GFX9-NEXT: s_mov_b32 s18, s22
; GFX9-NEXT: s_mov_b32 s19, s23
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[24:25]
-; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -537,11 +537,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -597,11 +597,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -657,11 +657,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -717,12 +717,12 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -778,11 +778,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -839,11 +839,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -899,12 +899,12 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -960,12 +960,12 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1021,12 +1021,12 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1082,12 +1082,12 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1143,11 +1143,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
; GFX9-NEXT: v_writelane_b32 v40, s17, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1203,14 +1203,14 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
; GFX9-NEXT: v_writelane_b32 v40, s20, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[20:21]
+; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1266,12 +1266,12 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
; GFX9-NEXT: v_writelane_b32 v40, s18, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s1, s17
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1327,15 +1327,15 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
; GFX9-NEXT: v_writelane_b32 v40, s21, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[22:23]
+; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: s_mov_b32 s16, s20
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[22:23]
-; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1391,6 +1391,9 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX9-NEXT: v_writelane_b32 v40, s29, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 vcc
+; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s1, s17
@@ -1405,9 +1408,6 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; GFX9-NEXT: s_mov_b32 s23, s27
; GFX9-NEXT: s_mov_b32 s24, s28
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 vcc
-; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], vcc
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -1465,6 +1465,9 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX9-NEXT: v_writelane_b32 v40, s21, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[22:23]
+; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
; GFX9-NEXT: s_mov_b32 s3, s7
; GFX9-NEXT: s_mov_b32 s2, s6
; GFX9-NEXT: s_mov_b32 s1, s5
@@ -1480,9 +1483,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
; GFX9-NEXT: s_mov_b32 s15, s19
; GFX9-NEXT: s_mov_b32 s16, s20
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[22:23]
-; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 26ab0f3ce63559..332509b3cbffda 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 1
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -90,12 +90,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 1
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 1
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -109,24 +109,23 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 1
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -135,14 +134,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 1
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 1
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_i1(i1 true)
@@ -164,11 +163,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
@@ -187,11 +186,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: v_bfe_i32 v0, v0, 0, 1
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
@@ -210,11 +209,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
@@ -226,10 +225,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -244,12 +243,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
@@ -274,11 +273,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
@@ -297,11 +296,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: v_and_b32_e32 v0, 1, v0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
@@ -320,11 +319,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
@@ -336,10 +335,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -354,12 +353,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_zeroext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: v_and_b32_e32 v0, 1, v0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
@@ -379,12 +378,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 0x7b
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 0x7b
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -398,12 +397,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 0x7b
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 0x7b
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -417,24 +416,23 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -444,13 +442,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_i8(i8 123)
@@ -473,11 +471,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -495,11 +493,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -517,11 +515,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -532,11 +530,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -550,12 +547,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_signext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_signext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%var = load volatile i8, ptr addrspace(1) undef
@@ -578,11 +575,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -600,11 +597,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -622,11 +619,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -637,11 +634,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -655,12 +651,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_zeroext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_zeroext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%var = load volatile i8, ptr addrspace(1) undef
@@ -679,12 +675,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 0x7b
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 0x7b
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -698,12 +694,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 0x7b
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 0x7b
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -717,24 +713,23 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -743,14 +738,14 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_i16(i16 123)
@@ -772,11 +767,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -794,11 +789,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -816,11 +811,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -831,11 +826,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -849,12 +843,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_signext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_signext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%var = load volatile i16, ptr addrspace(1) undef
@@ -877,11 +871,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -899,11 +893,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -921,11 +915,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -936,11 +930,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -954,12 +947,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_zeroext at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_zeroext at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%var = load volatile i16, ptr addrspace(1) undef
@@ -978,12 +971,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -997,12 +990,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1016,24 +1009,23 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i32_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1043,13 +1035,13 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 42
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 42
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_i32(i32 42)
@@ -1067,13 +1059,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0x7b
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1087,13 +1079,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1107,25 +1099,24 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i64_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1134,15 +1125,15 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_i64(i64 123)
@@ -1165,11 +1156,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1188,11 +1179,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1211,11 +1202,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1225,13 +1216,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s5, s4
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1245,13 +1235,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA-NEXT: s_mov_b32 s10, -1
; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) null
@@ -1270,15 +1260,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: v_mov_b32_e32 v1, 2
; VI-NEXT: v_mov_b32_e32 v2, 3
; VI-NEXT: v_mov_b32_e32 v3, 4
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1292,15 +1282,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
; CI-NEXT: v_mov_b32_e32 v2, 3
; CI-NEXT: v_mov_b32_e32 v3, 4
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1314,15 +1304,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NEXT: v_mov_b32_e32 v2, 3
; GFX9-NEXT: v_mov_b32_e32 v3, 4
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1330,12 +1320,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1344,17 +1333,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
@@ -1377,13 +1366,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v4, 1
; VI-NEXT: v_mov_b32_e32 v5, 2
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1402,13 +1391,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v4, 1
; CI-NEXT: v_mov_b32_e32 v5, 2
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1427,13 +1416,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, 2
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1445,12 +1434,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: s_mov_b32 s5, s4
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1464,15 +1452,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA-NEXT: s_mov_b32 s10, -1
; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%load = load <2 x i64>, ptr addrspace(1) null
@@ -1498,15 +1486,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v4, 1
; VI-NEXT: v_mov_b32_e32 v5, 2
; VI-NEXT: v_mov_b32_e32 v6, 3
; VI-NEXT: v_mov_b32_e32 v7, 4
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1525,15 +1513,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v4, 1
; CI-NEXT: v_mov_b32_e32 v5, 2
; CI-NEXT: v_mov_b32_e32 v6, 3
; CI-NEXT: v_mov_b32_e32 v7, 4
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1552,15 +1540,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, 2
; GFX9-NEXT: v_mov_b32_e32 v6, 3
; GFX9-NEXT: v_mov_b32_e32 v7, 4
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1573,12 +1561,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1592,17 +1579,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA-NEXT: s_mov_b32 s10, -1
; HSA-NEXT: s_mov_b32 s9, s8
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v4, 1
; HSA-NEXT: v_mov_b32_e32 v5, 2
; HSA-NEXT: v_mov_b32_e32 v6, 3
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v7, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%load = load <2 x i64>, ptr addrspace(1) null
@@ -1622,12 +1609,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 0x4400
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1641,12 +1628,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 4.0
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 4.0
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1660,24 +1647,23 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f16_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1686,14 +1672,14 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 0x4400
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 0x4400
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_f16(half 4.0)
@@ -1711,12 +1697,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: v_mov_b32_e32 v0, 4.0
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: v_mov_b32_e32 v0, 4.0
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1730,12 +1716,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 4.0
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: v_mov_b32_e32 v0, 4.0
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1749,24 +1735,23 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f32_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1775,14 +1760,14 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: v_mov_b32_e32 v0, 4.0
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: v_mov_b32_e32 v0, 4.0
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_f32(float 4.0)
@@ -1800,13 +1785,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1820,13 +1805,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1840,25 +1825,24 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1867,15 +1851,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
@@ -1893,14 +1877,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: v_mov_b32_e32 v2, 4.0
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -1914,14 +1898,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
; CI-NEXT: v_mov_b32_e32 v2, 4.0
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -1935,14 +1919,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-NEXT: v_mov_b32_e32 v2, 4.0
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1950,12 +1934,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -1964,16 +1947,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 4.0
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
@@ -1991,6 +1974,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
@@ -1998,9 +1984,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; VI-NEXT: v_mov_b32_e32 v3, -1.0
; VI-NEXT: v_mov_b32_e32 v4, 0.5
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2014,6 +1997,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2021,9 +2007,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; CI-NEXT: v_mov_b32_e32 v3, -1.0
; CI-NEXT: v_mov_b32_e32 v4, 0.5
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2037,6 +2020,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2044,9 +2030,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v3, -1.0
; GFX9-NEXT: v_mov_b32_e32 v4, 0.5
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2055,12 +2038,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
; GFX11-NEXT: v_mov_b32_e32 v4, 0.5
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2069,8 +2051,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1.0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2078,9 +2063,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v3, -1.0
; HSA-NEXT: v_mov_b32_e32 v4, 0.5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
@@ -2098,13 +2080,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40100000
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2118,13 +2100,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x40100000
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2138,25 +2120,24 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f64_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2165,15 +2146,15 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_f64(double 4.0)
@@ -2191,15 +2172,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: v_mov_b32_e32 v2, 0
-; VI-NEXT: v_mov_b32_e32 v3, 0x40100000
-; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
+; VI-NEXT: v_mov_b32_e32 v3, 0x40100000
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2213,15 +2194,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v3, 0x40100000
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2235,15 +2216,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2251,12 +2232,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2265,17 +2245,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
; HSA-NEXT: v_mov_b32_e32 v2, 0
; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
@@ -2293,6 +2273,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2301,9 +2284,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0x40200000
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2317,6 +2297,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2325,9 +2308,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: v_mov_b32_e32 v5, 0x40200000
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2341,6 +2321,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2349,9 +2332,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2360,12 +2340,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2374,8 +2353,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 2.0
@@ -2384,9 +2366,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v4, 0
; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
@@ -2407,11 +2386,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2428,11 +2407,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -2451,11 +2430,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2465,11 +2444,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2483,11 +2461,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <2 x i16>, ptr addrspace(1) undef
@@ -2509,11 +2487,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2530,11 +2508,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16
; CI-NEXT: v_mov_b32_e32 v0, v2
@@ -2555,11 +2533,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2569,11 +2547,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2587,11 +2564,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <3 x i16>, ptr addrspace(1) undef
@@ -2613,11 +2590,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2634,11 +2611,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -2660,11 +2637,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2674,11 +2651,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2692,11 +2668,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <3 x half>, ptr addrspace(1) undef
@@ -2715,13 +2691,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0x20001
; VI-NEXT: v_mov_b32_e32 v1, 3
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2735,14 +2711,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
; CI-NEXT: v_mov_b32_e32 v2, 3
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2756,25 +2732,24 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: v_mov_b32_e32 v1, 3
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2783,15 +2758,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 3
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
@@ -2809,13 +2784,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00
; VI-NEXT: v_mov_b32_e32 v1, 0x4400
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2829,14 +2804,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 2.0
; CI-NEXT: v_mov_b32_e32 v2, 4.0
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -2850,13 +2825,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2864,12 +2839,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2878,15 +2852,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00
; HSA-NEXT: v_mov_b32_e32 v1, 0x4400
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
@@ -2907,11 +2881,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -2928,11 +2902,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
@@ -2954,11 +2928,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -2968,11 +2942,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -2986,11 +2959,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <4 x i16>, ptr addrspace(1) undef
@@ -3009,13 +2982,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 0x20001
; VI-NEXT: v_mov_b32_e32 v1, 0x40003
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3029,15 +3002,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
; CI-NEXT: v_mov_b32_e32 v2, 3
; CI-NEXT: v_mov_b32_e32 v3, 4
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3051,13 +3024,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3065,12 +3038,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3079,15 +3051,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 0x20001
; HSA-NEXT: v_mov_b32_e32 v1, 0x40003
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
@@ -3108,11 +3080,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3129,11 +3101,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -3154,11 +3126,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3168,11 +3140,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3186,11 +3157,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f16 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f16 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <2 x half>, ptr addrspace(1) undef
@@ -3212,11 +3183,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3233,11 +3204,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3254,11 +3225,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3268,11 +3239,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3286,11 +3256,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <2 x i32>, ptr addrspace(1) undef
@@ -3309,13 +3279,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: v_mov_b32_e32 v1, 2
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3329,13 +3299,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3349,25 +3319,24 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3376,15 +3345,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
@@ -3402,14 +3371,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 3
; VI-NEXT: v_mov_b32_e32 v1, 4
; VI-NEXT: v_mov_b32_e32 v2, 5
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3423,14 +3392,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 3
; CI-NEXT: v_mov_b32_e32 v1, 4
; CI-NEXT: v_mov_b32_e32 v2, 5
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3444,14 +3413,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: v_mov_b32_e32 v2, 5
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3459,12 +3428,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_mov_b32_e32 v2, 5
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3474,15 +3442,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
@@ -3500,15 +3468,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 3
; VI-NEXT: v_mov_b32_e32 v1, 4
; VI-NEXT: v_mov_b32_e32 v2, 5
; VI-NEXT: v_mov_b32_e32 v3, 6
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3522,15 +3490,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 3
; CI-NEXT: v_mov_b32_e32 v1, 4
; CI-NEXT: v_mov_b32_e32 v2, 5
; CI-NEXT: v_mov_b32_e32 v3, 6
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3544,15 +3512,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: v_mov_b32_e32 v2, 5
; GFX9-NEXT: v_mov_b32_e32 v3, 6
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3560,12 +3528,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3575,16 +3542,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 3
; HSA-NEXT: v_mov_b32_e32 v1, 4
; HSA-NEXT: v_mov_b32_e32 v2, 5
; HSA-NEXT: v_mov_b32_e32 v3, 6
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
@@ -3605,11 +3572,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3626,11 +3593,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3647,11 +3614,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3661,11 +3628,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3679,11 +3645,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; HSA-NEXT: s_add_u32 s0, s0, s9
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%val = load <4 x i32>, ptr addrspace(1) undef
@@ -3702,15 +3668,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: v_mov_b32_e32 v1, 2
; VI-NEXT: v_mov_b32_e32 v2, 3
; VI-NEXT: v_mov_b32_e32 v3, 4
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3724,15 +3690,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
; CI-NEXT: v_mov_b32_e32 v2, 3
; CI-NEXT: v_mov_b32_e32 v3, 4
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3746,15 +3712,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
; GFX9-NEXT: v_mov_b32_e32 v2, 3
; GFX9-NEXT: v_mov_b32_e32 v3, 4
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3762,12 +3728,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3776,17 +3741,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
; HSA-NEXT: v_mov_b32_e32 v2, 3
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
@@ -3804,6 +3769,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: v_mov_b32_e32 v1, 2
@@ -3811,9 +3779,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; VI-NEXT: v_mov_b32_e32 v3, 4
; VI-NEXT: v_mov_b32_e32 v4, 5
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3827,6 +3792,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
@@ -3834,9 +3802,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; CI-NEXT: v_mov_b32_e32 v3, 4
; CI-NEXT: v_mov_b32_e32 v4, 5
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3850,6 +3815,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
@@ -3857,9 +3825,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v3, 4
; GFX9-NEXT: v_mov_b32_e32 v4, 5
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3868,12 +3833,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT: v_mov_b32_e32 v4, 5
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3882,8 +3846,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -3891,9 +3858,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v3, 4
; HSA-NEXT: v_mov_b32_e32 v4, 5
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
@@ -3917,11 +3881,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -3941,11 +3905,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -3965,11 +3929,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -3978,10 +3942,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -3998,16 +3962,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) undef
@@ -4027,6 +3991,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: v_mov_b32_e32 v1, 2
@@ -4037,9 +4004,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; VI-NEXT: v_mov_b32_e32 v6, 7
; VI-NEXT: v_mov_b32_e32 v7, 8
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -4053,6 +4017,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: v_mov_b32_e32 v0, 1
; CI-NEXT: v_mov_b32_e32 v1, 2
@@ -4063,9 +4030,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; CI-NEXT: v_mov_b32_e32 v6, 7
; CI-NEXT: v_mov_b32_e32 v7, 8
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -4079,6 +4043,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, 2
@@ -4089,9 +4056,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-NEXT: v_mov_b32_e32 v6, 7
; GFX9-NEXT: v_mov_b32_e32 v7, 8
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -4101,12 +4065,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -4115,8 +4078,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-NEXT: s_add_i32 s6, s6, s9
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; HSA-NEXT: s_add_u32 s0, s0, s9
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 1
; HSA-NEXT: v_mov_b32_e32 v1, 2
@@ -4127,9 +4093,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-NEXT: v_mov_b32_e32 v6, 7
; HSA-NEXT: v_mov_b32_e32 v7, 8
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
@@ -4155,11 +4118,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -4181,11 +4144,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -4207,11 +4170,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -4220,10 +4183,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -4242,18 +4205,18 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) undef
@@ -4286,10 +4249,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[8:9]
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; VI-NEXT: s_swappc_b64 s[30:31], s[8:9]
@@ -4318,10 +4281,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[8:9]
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
@@ -4350,10 +4313,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
@@ -4392,7 +4355,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
@@ -4402,12 +4365,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_getpc_b64 s[12:13]
; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_waitcnt vmcnt(7)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13]
@@ -4443,10 +4406,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_mov_b32 s32, 0
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(8)
@@ -4478,10 +4441,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_mov_b32 s32, 0
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4
; CI-NEXT: s_waitcnt vmcnt(8)
@@ -4513,10 +4476,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(8)
@@ -4572,12 +4535,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32_i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_waitcnt vmcnt(8)
; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
; HSA-NEXT: s_waitcnt vmcnt(8)
@@ -4603,14 +4566,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; VI-NEXT: s_addc_u32 s41, s41, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[40:41]
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[42:43]
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_mov_b32 s39, 0xf000
; VI-NEXT: s_mov_b32 s38, -1
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4627,14 +4590,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
; CI-NEXT: s_mov_b32 s38, -1
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4651,14 +4614,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX9-NEXT: s_addc_u32 s41, s41, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_mov_b32 s39, 0xf000
; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -4668,14 +4631,13 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[36:37], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+12
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_mov_b32 s39, 0x31016000
; GFX11-NEXT: s_mov_b32 s38, -1
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -4684,19 +4646,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; HSA-LABEL: test_call_external_i32_func_i32_imm:
; HSA: ; %bb.0:
; HSA-NEXT: s_add_i32 s8, s8, s11
-; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0
; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0
; HSA-NEXT: s_add_u32 s0, s0, s11
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT: s_getpc_b64 s[8:9]
+; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32 at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, 42
; HSA-NEXT: s_mov_b32 s32, 0
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_mov_b32 s39, 0x1100f000
; HSA-NEXT: s_mov_b32 s38, -1
-; HSA-NEXT: s_getpc_b64 s[8:9]
-; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0
; HSA-NEXT: s_waitcnt vmcnt(0)
@@ -4723,11 +4685,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -4747,11 +4709,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -4771,11 +4733,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -4784,10 +4746,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0
@@ -4804,16 +4766,16 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT: s_addc_u32 s1, s1, 0
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; HSA-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4
-; HSA-NEXT: s_addc_u32 s1, s1, 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
@@ -4840,10 +4802,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
; VI-NEXT: s_movk_i32 s32, 0x400
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -4868,10 +4830,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
; CI-NEXT: s_movk_i32 s32, 0x400
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; CI-NEXT: s_waitcnt vmcnt(1)
@@ -4897,10 +4859,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_movk_i32 s32, 0x400
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -4912,10 +4874,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT: s_mov_b32 s32, 16
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, off
; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
@@ -4939,10 +4901,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT: s_movk_i32 s32, 0x400
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT: s_waitcnt vmcnt(1)
@@ -4976,10 +4938,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; VI-NEXT: s_movk_i32 s32, 0x800
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -5014,10 +4976,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; CI-NEXT: s_movk_i32 s32, 0x800
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; CI-NEXT: s_waitcnt vmcnt(1)
@@ -5053,10 +5015,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -5078,10 +5040,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT: s_mov_b32 s32, 32
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, off
; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4
@@ -5118,11 +5080,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0
; HSA-NEXT: s_movk_i32 s32, 0x800
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT: s_waitcnt vmcnt(1)
@@ -5172,11 +5134,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_addc_u32 s37, s37, 0
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0
@@ -5214,11 +5176,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_addc_u32 s37, s37, 0
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v0
@@ -5256,11 +5218,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0
@@ -5288,10 +5250,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+12
+; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
@@ -5323,15 +5285,15 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; HSA-NEXT: s_mov_b32 s11, 0x1100f000
; HSA-NEXT: s_mov_b32 s10, -1
-; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_addc_u32 s1, s1, 0
+; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7
; HSA-NEXT: s_waitcnt lgkmcnt(0)
; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_getpc_b64 s[8:9]
; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i8 at rel32@lo+4
; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i8 at rel32@hi+12
+; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0
@@ -5381,6 +5343,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[52:53]
; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; VI-NEXT: s_mov_b64 s[2:3], s[54:55]
; VI-NEXT: v_mov_b32_e32 v0, s36
; VI-NEXT: v_mov_b32_e32 v1, s37
@@ -5413,9 +5378,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; VI-NEXT: v_mov_b32_e32 v28, s20
; VI-NEXT: v_mov_b32_e32 v29, s21
; VI-NEXT: v_mov_b32_e32 v30, s22
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: s_endpgm
;
@@ -5440,6 +5402,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[52:53]
; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[54:55]
; CI-NEXT: v_mov_b32_e32 v0, s36
; CI-NEXT: v_mov_b32_e32 v1, s37
@@ -5472,9 +5437,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; CI-NEXT: v_mov_b32_e32 v28, s20
; CI-NEXT: v_mov_b32_e32 v29, s21
; CI-NEXT: v_mov_b32_e32 v30, s22
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
@@ -5499,6 +5461,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-NEXT: v_mov_b32_e32 v0, s36
; GFX9-NEXT: v_mov_b32_e32 v1, s37
@@ -5531,9 +5496,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX9-NEXT: v_mov_b32_e32 v28, s20
; GFX9-NEXT: v_mov_b32_e32 v29, s21
; GFX9-NEXT: v_mov_b32_e32 v30, s22
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -5569,11 +5531,10 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX11-NEXT: v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14
; GFX11-NEXT: v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16
; GFX11-NEXT: v_mov_b32_e32 v30, s18
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -5595,6 +5556,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; HSA-NEXT: v_mov_b32_e32 v0, s25
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT: s_getpc_b64 s[24:25]
+; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg at rel32@hi+12
; HSA-NEXT: s_mov_b64 s[6:7], s[4:5]
; HSA-NEXT: v_mov_b32_e32 v0, s36
; HSA-NEXT: v_mov_b32_e32 v1, s37
@@ -5627,9 +5591,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-NEXT: v_mov_b32_e32 v28, s20
; HSA-NEXT: v_mov_b32_e32 v29, s21
; HSA-NEXT: v_mov_b32_e32 v30, s22
-; HSA-NEXT: s_getpc_b64 s[24:25]
-; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25]
; HSA-NEXT: s_endpgm
entry:
@@ -5835,6 +5796,9 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 15
; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
@@ -5867,9 +5831,6 @@ define void @stack_12xv3i32() #0 {
; VI-NEXT: v_mov_b32_e32 v29, 9
; VI-NEXT: v_mov_b32_e32 v30, 10
; VI-NEXT: v_writelane_b32 v40, s31, 1
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
@@ -5903,6 +5864,9 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 15
; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
@@ -5935,9 +5899,6 @@ define void @stack_12xv3i32() #0 {
; CI-NEXT: v_mov_b32_e32 v29, 9
; CI-NEXT: v_mov_b32_e32 v30, 10
; CI-NEXT: v_writelane_b32 v40, s31, 1
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
@@ -5971,6 +5932,9 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
@@ -6003,9 +5967,6 @@ define void @stack_12xv3i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 9
; GFX9-NEXT: v_mov_b32_e32 v30, 10
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -6051,12 +6012,12 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8
; GFX11-NEXT: v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9
; GFX11-NEXT: v_mov_b32_e32 v30, 10
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
@@ -6089,6 +6050,9 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 15
; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT: s_getpc_b64 s[4:5]
+; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -6121,9 +6085,6 @@ define void @stack_12xv3i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v29, 9
; HSA-NEXT: v_mov_b32_e32 v30, 10
; HSA-NEXT: v_writelane_b32 v40, s31, 1
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
@@ -6174,6 +6135,9 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 0x41700000
; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
@@ -6206,9 +6170,6 @@ define void @stack_12xv3f32() #0 {
; VI-NEXT: v_mov_b32_e32 v29, 0x41100000
; VI-NEXT: v_mov_b32_e32 v30, 0x41200000
; VI-NEXT: v_writelane_b32 v40, s31, 1
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
@@ -6242,6 +6203,9 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 0x41700000
; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
@@ -6274,9 +6238,6 @@ define void @stack_12xv3f32() #0 {
; CI-NEXT: v_mov_b32_e32 v29, 0x41100000
; CI-NEXT: v_mov_b32_e32 v30, 0x41200000
; CI-NEXT: v_writelane_b32 v40, s31, 1
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
@@ -6310,6 +6271,9 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
@@ -6342,9 +6306,6 @@ define void @stack_12xv3f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000
; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -6394,12 +6355,12 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v29, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v30, 0x41200000
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
@@ -6432,6 +6393,9 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000
; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT: s_getpc_b64 s[4:5]
+; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -6464,9 +6428,6 @@ define void @stack_12xv3f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000
; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
@@ -6525,6 +6486,9 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 15
; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
@@ -6557,9 +6521,6 @@ define void @stack_8xv5i32() #0 {
; VI-NEXT: v_mov_b32_e32 v29, 5
; VI-NEXT: v_mov_b32_e32 v30, 6
; VI-NEXT: v_writelane_b32 v40, s31, 1
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
@@ -6601,6 +6562,9 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 15
; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
@@ -6633,9 +6597,6 @@ define void @stack_8xv5i32() #0 {
; CI-NEXT: v_mov_b32_e32 v29, 5
; CI-NEXT: v_mov_b32_e32 v30, 6
; CI-NEXT: v_writelane_b32 v40, s31, 1
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
@@ -6677,6 +6638,9 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
@@ -6709,9 +6673,6 @@ define void @stack_8xv5i32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 5
; GFX9-NEXT: v_mov_b32_e32 v30, 6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -6762,12 +6723,12 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5
; GFX11-NEXT: v_mov_b32_e32 v28, 5
; GFX11-NEXT: v_mov_b32_e32 v30, 6
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
@@ -6808,6 +6769,9 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 15
; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; HSA-NEXT: s_getpc_b64 s[4:5]
+; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -6840,9 +6804,6 @@ define void @stack_8xv5i32() #0 {
; HSA-NEXT: v_mov_b32_e32 v29, 5
; HSA-NEXT: v_mov_b32_e32 v30, 6
; HSA-NEXT: v_writelane_b32 v40, s31, 1
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
@@ -6897,6 +6858,9 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v0, 0x41700000
; VI-NEXT: v_writelane_b32 v40, s30, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; VI-NEXT: s_getpc_b64 s[4:5]
+; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
@@ -6929,9 +6893,6 @@ define void @stack_8xv5f32() #0 {
; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000
; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; VI-NEXT: v_writelane_b32 v40, s31, 1
-; VI-NEXT: s_getpc_b64 s[4:5]
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; VI-NEXT: v_readlane_b32 s31, v40, 1
; VI-NEXT: v_readlane_b32 s30, v40, 0
@@ -6973,6 +6934,9 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v0, 0x41700000
; CI-NEXT: v_writelane_b32 v40, s30, 0
; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; CI-NEXT: s_getpc_b64 s[4:5]
+; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
@@ -7005,9 +6969,6 @@ define void @stack_8xv5f32() #0 {
; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000
; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000
; CI-NEXT: v_writelane_b32 v40, s31, 1
-; CI-NEXT: s_getpc_b64 s[4:5]
-; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: v_readlane_b32 s31, v40, 1
; CI-NEXT: v_readlane_b32 s30, v40, 0
@@ -7049,6 +7010,9 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
@@ -7081,9 +7045,6 @@ define void @stack_8xv5f32() #0 {
; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000
; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
@@ -7137,12 +7098,12 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000
; GFX11-NEXT: v_dual_mov_b32 v29, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000
; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
@@ -7183,6 +7144,9 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000
; HSA-NEXT: v_writelane_b32 v40, s30, 0
; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
+; HSA-NEXT: s_getpc_b64 s[4:5]
+; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; HSA-NEXT: v_mov_b32_e32 v0, 0
; HSA-NEXT: v_mov_b32_e32 v1, 0
; HSA-NEXT: v_mov_b32_e32 v2, 0
@@ -7215,9 +7179,6 @@ define void @stack_8xv5f32() #0 {
; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000
; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000
; HSA-NEXT: v_writelane_b32 v40, s31, 1
-; HSA-NEXT: s_getpc_b64 s[4:5]
-; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
-; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
; HSA-NEXT: v_readlane_b32 s31, v40, 1
; HSA-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index 3626b2b316fba7..b8804a8636ef08 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -72,11 +72,11 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_mov_b32_e32 v31, v0
-; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee at rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
call void @callee()
@@ -111,11 +111,11 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_mov_b32_e32 v31, 0
-; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee at rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v31, 0
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
call void @callee()
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 10f0efea59b607..15152409eacf9e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -13,11 +13,11 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func at rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func at rel32@hi+12
+; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, ptr addrspace(3) %ptr
@@ -33,16 +33,16 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_getpc_b64 s[8:9]
+; GCN-NEXT: s_add_u32 s8, s8, func at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s9, s9, func at rel32@hi+12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_getpc_b64 s[8:9]
-; GCN-NEXT: s_add_u32 s8, s8, func at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s9, s9, func at rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_endpgm
store i32 0, ptr addrspace(1) %ptr
@@ -55,16 +55,16 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
; GCN-LABEL: call_no_wait_after_call:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func at rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func at rel32@hi+12
+; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
@@ -78,16 +78,16 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func.return at rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func.return at rel32@hi+12
+; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index c511f88aeaf86c..fc24041fe771a6 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -69,20 +69,20 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
-; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT: s_mov_b32 s14, s16
-; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX803-NEXT: s_mov_b32 s14, s16
+; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
@@ -91,17 +91,17 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
-; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT: s_mov_b32 s14, s16
-; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX900-NEXT: s_mov_b32 s14, s16
+; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
@@ -119,10 +119,10 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
@@ -132,14 +132,14 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT: s_getpc_b64 s[16:17]
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
-; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -153,23 +153,23 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
+; GFX803-NEXT: s_getpc_b64 s[18:19]
+; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
@@ -178,20 +178,20 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: s_getpc_b64 s[18:19]
+; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
@@ -210,12 +210,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
-; GFX1010-NEXT: s_mov_b32 s14, s16
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
-; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX1010-NEXT: s_mov_b32 s14, s16
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
@@ -226,6 +226,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT: s_getpc_b64 s[16:17]
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
@@ -233,9 +236,6 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -320,21 +320,21 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
+; GFX803-NEXT: s_getpc_b64 s[18:19]
+; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_mov_b32 s32, 0
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
@@ -343,18 +343,18 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
+; GFX900-NEXT: s_getpc_b64 s[18:19]
+; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_mov_b32 s32, 0
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
@@ -373,10 +373,10 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
@@ -386,15 +386,15 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT: s_getpc_b64 s[16:17]
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_mov_b32 s32, 0
-; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
; GFX1010-NEXT: s_add_u32 s12, s12, s17
@@ -426,24 +426,24 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
+; GFX803-NEXT: s_getpc_b64 s[18:19]
+; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: s_getpc_b64 s[18:19]
-; GFX803-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
@@ -452,21 +452,21 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
+; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
-; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: s_getpc_b64 s[18:19]
+; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
+; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_getpc_b64 s[18:19]
-; GFX900-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
@@ -486,12 +486,12 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
-; GFX1010-NEXT: s_mov_b32 s14, s16
-; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
-; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex at rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex at rel32@hi+12
+; GFX1010-NEXT: s_mov_b32 s14, s16
+; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33
+; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
;
@@ -503,6 +503,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: s_mov_b32 s12, s13
; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT: s_getpc_b64 s[16:17]
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT: s_mov_b32 s13, s14
@@ -510,9 +513,6 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: s_mov_b32 s32, 16
; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 5e6152661aeec4..e8911472138826 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -35,10 +35,10 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32 at rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32 at rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -71,10 +71,10 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32 at rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32 at rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -107,10 +107,10 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16 at rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16 at rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -143,10 +143,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
-; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_struct at rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_struct at rel32@hi+12
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
@@ -189,16 +189,16 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: s_cbranch_vccnz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, func_v3i16 at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16 at rel32@hi+12
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func_v3i16 at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16 at rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch .LBB4_3
; GCN-NEXT: .LBB4_2:
@@ -240,16 +240,16 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-NEXT: s_cbranch_vccnz .LBB5_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, func_v3f16 at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16 at rel32@hi+12
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, func_v3f16 at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16 at rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_2:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 36aa73fbf8e92a..65c9be59b17327 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -2917,8 +2917,8 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
@@ -5751,8 +5751,8 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index d96d3db9f005df..cc32b9bb9100f8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -2917,8 +2917,8 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
@@ -5751,8 +5751,8 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 3922b5404d7786..61f5c49b6c9071 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -19,12 +19,12 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0
; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
-; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
-; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5]
; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
+; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
@@ -62,11 +62,11 @@ define void @callee_with_stack_and_call() #0 {
; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5]
; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
-; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec
; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index fbe06b3651b06c..15be44a335a1d0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1267,16 +1267,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1487,16 +1487,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2487,16 +2487,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2737,16 +2737,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4543,16 +4543,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -4793,16 +4793,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5987,19 +5987,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s42, s9
; GFX1032-NEXT: s_mov_b32 s9, exec_lo
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6446,19 +6446,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s42, s9
; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -7692,8 +7692,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
@@ -8122,16 +8122,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8379,16 +8379,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9217,8 +9217,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -9555,16 +9555,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -9812,16 +9812,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10650,8 +10650,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -11565,8 +11565,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -13748,8 +13748,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 77924222919984..a4410bb9ed2d04 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3366,17 +3366,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s11
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3806,17 +3806,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
@@ -6469,8 +6469,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -6914,17 +6914,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s11
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7354,17 +7354,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8642,8 +8642,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index cb3291df891af4..68d7dcc60506c1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3366,17 +3366,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s11
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3806,17 +3806,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
@@ -6469,8 +6469,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -6914,17 +6914,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s11
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7354,17 +7354,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8642,8 +8642,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6dc3a1971a485f..7126680525b879 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1379,16 +1379,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1629,16 +1629,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2711,16 +2711,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2961,16 +2961,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4871,16 +4871,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -5121,16 +5121,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -6315,19 +6315,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s42, s9
; GFX1032-NEXT: s_mov_b32 s9, exec_lo
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-NEXT: s_mov_b32 s46, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6774,19 +6774,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s42, s9
; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8020,8 +8020,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
@@ -8450,16 +8450,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8707,16 +8707,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9544,8 +9544,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -9882,16 +9882,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -10139,16 +10139,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10977,8 +10977,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -11892,8 +11892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
@@ -14074,8 +14074,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s46, 0
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index c826980991f94f..acb706cee04d06 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -54,6 +54,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_addc_u32 s45, s35, 0
; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
@@ -62,14 +65,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v45, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
@@ -77,13 +80,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
@@ -92,26 +95,23 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: global_load_dword v0, v0, s[52:53]
+; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
@@ -190,6 +190,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -197,9 +200,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
@@ -215,6 +215,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -223,9 +226,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
@@ -241,6 +241,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -249,9 +252,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
@@ -267,6 +267,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -275,9 +278,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
@@ -319,6 +319,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
@@ -326,9 +329,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
@@ -356,15 +356,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
@@ -381,15 +381,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
@@ -439,16 +439,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0
; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
; CHECK-NEXT: s_addc_u32 s9, s35, 0
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj at rel32@hi+12
; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
-; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj at rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0
@@ -500,15 +500,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v2, v44
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_branch .LBB0_27
; CHECK-NEXT: .LBB0_33:
@@ -803,6 +803,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_addc_u32 s45, s39, 0
; CHECK-NEXT: s_mov_b32 s43, s14
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
@@ -811,14 +814,14 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v43, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
@@ -826,13 +829,13 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
@@ -841,27 +844,24 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
-; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: global_load_dword v0, v0, s[46:47]
+; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj at rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
@@ -945,6 +945,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s38, 40
; CHECK-NEXT: s_addc_u32 s9, s39, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
@@ -952,9 +955,6 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
@@ -982,15 +982,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s38, 40
; CHECK-NEXT: s_addc_u32 s9, s39, 0
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s43
; CHECK-NEXT: s_mov_b32 s13, s42
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj at rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
.5:
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index 6f841c88a6d8bb..f60786c1bacbff 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -4,8 +4,8 @@
define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_then:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
@@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_else_vgpr_opt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 715ea57d473f5b..0501602bbd8f43 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1023,8 +1023,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
;
; GFX10-WAVE32-LABEL: test_kill_divergent_loop:
; GFX10-WAVE32: ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 5641c43c40084c..d4d3b37a0ed1e9 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -21,10 +21,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v3, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000
-; MUBUF-NEXT: s_mov_b32 s32, 0xc0000
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes at rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes at rel32@hi+12
+; MUBUF-NEXT: s_mov_b32 s32, 0xc0000
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s0
; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37]
@@ -85,10 +85,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000
; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0
; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000
-; MUBUF11-NEXT: s_movk_i32 s32, 0x6000
; MUBUF11-NEXT: s_getpc_b64 s[0:1]
; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes at rel32@lo+4
; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes at rel32@hi+12
+; MUBUF11-NEXT: s_movk_i32 s32, 0x6000
; MUBUF11-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF11-NEXT: v_mov_b32_e32 v0, s2
; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -112,10 +112,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000
; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0
; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000
-; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000
; FLATSCR11-NEXT: s_getpc_b64 s[0:1]
; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes at rel32@lo+4
; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes at rel32@hi+12
+; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000
; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
index 4f33e19835172a..5917522f2bfa0b 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
@@ -25,17 +25,20 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a
; CHECK-NEXT: s_getpc_b64 s[18:19]
; CHECK-NEXT: s_add_u32 s18, s18, global at rel32@lo+1948
; CHECK-NEXT: s_addc_u32 s19, s19, global at rel32@hi+1956
-; CHECK-NEXT: v_mov_b32_e32 v5, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, s18
-; CHECK-NEXT: v_mov_b32_e32 v1, s19
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, eggs at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, eggs at rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v5, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s18
+; CHECK-NEXT: v_mov_b32_e32 v1, s19
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: .LBB0_3: ; %LeafBlock1
; CHECK-NEXT: s_cbranch_scc0 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb8
; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: s_getpc_b64 s[16:17]
+; CHECK-NEXT: s_add_u32 s16, s16, quux at rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s17, s17, quux at rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v1, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v6
; CHECK-NEXT: v_mov_b32_e32 v3, v7
@@ -47,9 +50,6 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a
; CHECK-NEXT: v_mov_b32_e32 v9, v13
; CHECK-NEXT: v_mov_b32_e32 v10, v14
; CHECK-NEXT: v_mov_b32_e32 v11, v15
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, quux at rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, quux at rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: .LBB0_5: ; %bb9
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
index 0689c0585d8a69..80dae9142870a2 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll
@@ -7,10 +7,10 @@ define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) {
; CHECK-LABEL: tail_call_i32_inreg_uniform:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s0, s16
; CHECK-NEXT: s_getpc_b64 s[18:19]
; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg at rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s0, s16
; CHECK-NEXT: s_setpc_b64 s[18:19]
tail call void @void_func_i32_inreg(i32 inreg %sgpr)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 12eec4fa3bd594..dd78c2f46dde8a 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -290,6 +290,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
@@ -297,9 +300,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
-; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
-; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0
; GLOBALNESS1-NEXT: .LBB1_32: ; %Flow
@@ -308,6 +308,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35]
@@ -315,9 +318,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17]
-; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
-; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GLOBALNESS1-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock
;
@@ -582,6 +582,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
@@ -589,9 +592,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
-; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
-; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0
; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow
@@ -600,6 +600,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
+; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35]
@@ -607,9 +610,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17]
-; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget at rel32@lo+4
-; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget at rel32@hi+12
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GLOBALNESS0-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index a0bce3432a4bd0..c0b56d05f72aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -161,16 +161,16 @@ for.end:
define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
; SI-LABEL: loop:
; SI: ; %bb.0: ; %main_body
+; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
@@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: v_mov_b32_e32 v40, v1
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4e17be1ebb312e..0acf10d46ae819 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
+; GFX1032-NEXT: s_mov_b32 s3, 0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
@@ -515,8 +515,8 @@ bb13:
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 1089093ea691c3..b73f2ed6e7c767 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -416,10 +416,10 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 08cc2e4ec7d794..ddc50b7d495047 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -426,12 +426,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
-; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_getpc_b64 s[22:23]
; GFX9-O3-NEXT: s_add_u32 s22, s22, called at rel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s23, s23, called at rel32@hi+12
+; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6
@@ -1278,12 +1278,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
-; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_getpc_b64 s[22:23]
; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called at rel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
index f367a5626c8d3c..b92f03d43bb4c5 100644
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll
@@ -41,8 +41,8 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon
; ENABLE-NEXT: bhs .LBB0_6
; ENABLE-NEXT: @ %bb.5: @ %while.body
; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1
-; ENABLE-NEXT: cmp r0, r2
; ENABLE-NEXT: mov r1, r3
+; ENABLE-NEXT: cmp r0, r2
; ENABLE-NEXT: blo .LBB0_4
; ENABLE-NEXT: .LBB0_6: @ %if.end29
; ENABLE-NEXT: pop {r11, pc}
@@ -131,8 +131,8 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon
; DISABLE-NEXT: bhs .LBB0_6
; DISABLE-NEXT: @ %bb.5: @ %while.body
; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1
-; DISABLE-NEXT: cmp r0, r2
; DISABLE-NEXT: mov r1, r3
+; DISABLE-NEXT: cmp r0, r2
; DISABLE-NEXT: blo .LBB0_4
; DISABLE-NEXT: .LBB0_6: @ %if.end29
; DISABLE-NEXT: pop {r11, pc}
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
index 75416475289f31..5b2d0a8a7c059c 100644
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -2017,8 +2017,8 @@ define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %t
; ARM-DISABLE-NEXT: sub r4, sp, #24
; ARM-DISABLE-NEXT: bfc r4, #0, #4
; ARM-DISABLE-NEXT: mov sp, r4
-; ARM-DISABLE-NEXT: tst r2, #1
; ARM-DISABLE-NEXT: vst1.64 {d8, d9}, [r4:128]
+; ARM-DISABLE-NEXT: tst r2, #1
; ARM-DISABLE-NEXT: vstr d10, [r4, #16]
; ARM-DISABLE-NEXT: beq LBB12_2
; ARM-DISABLE-NEXT: @ %bb.1: @ %bb3
@@ -2123,8 +2123,8 @@ define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %t
; THUMB-DISABLE-NEXT: sub.w r4, sp, #24
; THUMB-DISABLE-NEXT: bfc r4, #0, #4
; THUMB-DISABLE-NEXT: mov sp, r4
-; THUMB-DISABLE-NEXT: lsls r1, r2, #31
; THUMB-DISABLE-NEXT: vst1.64 {d8, d9}, [r4:128]
+; THUMB-DISABLE-NEXT: lsls r1, r2, #31
; THUMB-DISABLE-NEXT: vstr d10, [r4, #16]
; THUMB-DISABLE-NEXT: beq LBB12_2
; THUMB-DISABLE-NEXT: @ %bb.1: @ %bb3
diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
index 7bc7b844396277..920742fffcf1e6 100644
--- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -50,9 +50,9 @@ define void @test_pr22678() {
define <4 x i32> @test_vmovrrd_combine() nounwind {
; CHECK-LABEL: test_vmovrrd_combine:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ implicit-def: $q8
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: @ implicit-def: $q8
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: @ %bb.1: @ %bb1.preheader
; CHECK-NEXT: vmov.i32 q8, #0x0
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
index ac9641ff35b0cb..c36b3bfd502286 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
@@ -54,13 +54,13 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
; CHECK-NEXT: # implicit-def: $r4
; CHECK-NEXT: .LBB0_8: # %bb20
; CHECK-NEXT: mfcr r12
-; CHECK-NEXT: cmpwi cr2, r3, -1
; CHECK-NEXT: cmpwi cr3, r4, -1
+; CHECK-NEXT: cmpwi cr2, r3, -1
; CHECK-NEXT: stw r12, 8(r1)
; CHECK-NEXT: cmpwi cr7, r3, 0
; CHECK-NEXT: cmpwi cr6, r4, 0
-; CHECK-NEXT: crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt
; CHECK-NEXT: crand 4*cr5+lt, 4*cr3+gt, 4*cr5+un
+; CHECK-NEXT: crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt
; CHECK-NEXT: # implicit-def: $x3
; CHECK-NEXT: bc 4, 4*cr5+gt, .LBB0_10
; CHECK-NEXT: # %bb.9: # %bb34
@@ -95,15 +95,15 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
; CHECK-NEXT: lwz r7, 0(r3)
; CHECK-NEXT: .LBB0_18: # %bb58
; CHECK-NEXT: lwz r6, 92(r6)
+; CHECK-NEXT: cmpwi cr4, r7, 1
; CHECK-NEXT: crand 4*cr7+un, 4*cr3+gt, 4*cr6+un
; CHECK-NEXT: cmpwi cr3, r5, 1
-; CHECK-NEXT: cmpwi cr4, r7, 1
; CHECK-NEXT: crand 4*cr7+gt, 4*cr7+eq, 4*cr1+lt
; CHECK-NEXT: # implicit-def: $x5
; CHECK-NEXT: crand 4*cr6+un, 4*cr2+eq, 4*cr6+un
; CHECK-NEXT: crand 4*cr5+un, 4*cr6+eq, 4*cr5+un
-; CHECK-NEXT: crand 4*cr6+gt, 4*cr3+lt, 4*cr6+gt
; CHECK-NEXT: crand 4*cr7+lt, 4*cr4+lt, 4*cr7+lt
+; CHECK-NEXT: crand 4*cr6+gt, 4*cr3+lt, 4*cr6+gt
; CHECK-NEXT: cmpwi r6, 1
; CHECK-NEXT: crand 4*cr6+lt, lt, 4*cr6+lt
; CHECK-NEXT: bc 4, 4*cr6+gt, .LBB0_20
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 6cb98557c9bc13..ca1f1036ea6740 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -24,8 +24,8 @@ define arm_aapcs_vfpcc void @fast_float_mul(ptr nocapture %a, ptr nocapture read
; CHECK-NEXT: cmpeq.w r12, #0
; CHECK-NEXT: beq .LBB0_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r4, r3, #1
; CHECK-NEXT: and r12, r3, #3
+; CHECK-NEXT: subs r4, r3, #1
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: bhs .LBB0_6
; CHECK-NEXT: @ %bb.3:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
index 44cbd7d65125ea..9c36bae6fac13f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -10,8 +10,8 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
; CHECK-NEXT: subs.w r9, r1, #1
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: subs r7, r1, #2
; CHECK-NEXT: and r8, r9, #3
+; CHECK-NEXT: subs r7, r1, #2
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index c0bc34c2b06efc..7c6c7e90413b17 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -1411,8 +1411,8 @@ define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocap
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cbz r2, .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r12, r2, #3
+; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
@@ -1566,8 +1566,8 @@ define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocap
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cbz r2, .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r12, r2, #3
+; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
@@ -1721,8 +1721,8 @@ define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr noca
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r12, r2, #3
+; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 715f6565199ec9..8a5a15a57912c9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -348,8 +348,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB5_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: and r12, r4, #3
+; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB5_6
; CHECK-NEXT: @ %bb.3:
@@ -624,8 +624,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB7_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: and r12, r4, #3
+; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB7_6
; CHECK-NEXT: @ %bb.3:
@@ -900,8 +900,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB9_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: and r12, r4, #3
+; CHECK-NEXT: subs r7, r4, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB9_6
; CHECK-NEXT: @ %bb.3:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 3c4af10b124386..6f986ce28381bc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -446,8 +446,8 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(ptr nocapture read
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w r3, r3, r7, lsr #2
; CHECK-NEXT: vmov.32 q0[0], r12
+; CHECK-NEXT: add.w r3, r3, r7, lsr #2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB6_5: @ %vector.body46
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 495ffe809f70fe..eb52b5ab9fd171 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1060,12 +1060,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vfma.f32 q0, q4, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: vfma.f32 q0, q2, lr
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vfma.f32 q0, q2, lr
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q3, r11
-; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
+; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
@@ -1603,8 +1603,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
; CHECK-NEXT: .LBB19_3: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: ldrd r5, r11, [r9]
+; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: ldrd r8, r10, [r9, #8]
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e63c62574dafbc..e8b49c1067379a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1376,8 +1376,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
; CHECK-NEXT: cmp r9, r1
; CHECK-NEXT: bne .LBB16_2
; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 7b8b884576d13e..eedca2cd4a5d3a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -712,8 +712,8 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: vaddv.u32 r10, q4
-; CHECK-NEXT: cmp r2, r12
; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: cmp r2, r12
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index 77980be9052072..652d25af02e7c9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -180,9 +180,9 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r2, r9, r10
-; CHECK-NEXT: sub.w r5, r8, r9
; CHECK-NEXT: add.w r7, r1, r9, lsl #1
; CHECK-NEXT: add.w r2, r1, r2, lsl #1
+; CHECK-NEXT: sub.w r5, r8, r9
; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index 45bb70ec44b737..f90af3cc5ba24b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -258,11 +258,11 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: adds r2, r5, #1
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: vmov q2, q0
@@ -618,13 +618,13 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
@@ -833,8 +833,8 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #2
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
+; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
@@ -1068,8 +1068,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r8, r0, #1
+; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q5, q2
@@ -1347,11 +1347,11 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
+; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
index dd63b8564bdb4f..096d4382d2c351 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
@@ -100,8 +100,8 @@ define void @arm_cmplx_dot_prod_q15(ptr nocapture readonly %pSrcA, ptr nocapture
; CHECK-NEXT: ldr.w r8, [sp, #36]
; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: mov r5, r7
-; CHECK-NEXT: and r2, r2, #3
; CHECK-NEXT: lsrl r6, r5, #6
+; CHECK-NEXT: and r2, r2, #3
; CHECK-NEXT: wls lr, r2, .LBB1_7
; CHECK-NEXT: .LBB1_5: @ %while.body11
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index c03339b52f2643..cba0f9cbba2cae 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -708,14 +708,14 @@ define ptr @signext(ptr %input_row, ptr %input_col, i16 zeroext %output_ch, i16
; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23
; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT: add.w r0, r8, r10
; CHECK-NEXT: ldr r1, [sp, #100]
+; CHECK-NEXT: add.w r0, r8, r10
; CHECK-NEXT: add r0, r6
; CHECK-NEXT: add r0, r12
; CHECK-NEXT: strb.w r0, [r1, r11]
; CHECK-NEXT: add.w r11, r11, #1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r11, r0
; CHECK-NEXT: beq .LBB5_8
; CHECK-NEXT: .LBB5_5: @ %for.body
@@ -933,14 +933,14 @@ define ptr @signext_optsize(ptr %input_row, ptr %input_col, i16 zeroext %output_
; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23
; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT: add.w r0, r8, r10
; CHECK-NEXT: ldr r1, [sp, #100]
+; CHECK-NEXT: add.w r0, r8, r10
; CHECK-NEXT: add r0, r6
; CHECK-NEXT: add r0, r12
; CHECK-NEXT: strb.w r0, [r1, r11]
; CHECK-NEXT: add.w r11, r11, #1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r11, r0
; CHECK-NEXT: bne .LBB6_3
; CHECK-NEXT: .LBB6_8: @ %if.end
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
index 9400f24e7192c8..723cbff1ab9d37 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -11,8 +11,8 @@ define void @arm_min_helium_f32(ptr %pSrc, i32 %blockSize, ptr nocapture %pResul
; CHECK-NEXT: vidup.u32 q2, r4, #1
; CHECK-NEXT: movw r5, #54437
; CHECK-NEXT: movt r5, #21352
-; CHECK-NEXT: vdup.32 q1, r5
; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vdup.32 q1, r5
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
index 42a00b61b41830..244a96595eaece 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
@@ -11,10 +11,10 @@ define arm_aapcs_vfpcc void @start12(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: subs r3, #12
; CHECK-NEXT: adds r0, #48
; CHECK-NEXT: adds r1, #48
; CHECK-NEXT: adds r2, #48
+; CHECK-NEXT: subs r3, #12
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll
index 03b769f256bc28..7bb0c74fb01631 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll
@@ -42,9 +42,9 @@ define hidden i32 @_Z1fiz(i32 %n, ...) local_unnamed_addr #0 {
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: add r0, sp, #28
; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: cmp r4, #1
; CHECK-NEXT: stm r0!, {r1, r2, r3}
; CHECK-NEXT: add r0, sp, #28
+; CHECK-NEXT: cmp r4, #1
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: blt .LBB0_2
; CHECK-NEXT: .LBB0_1: @ %for.body
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
index 5eb5990be7c118..e6fcf56af6e8db 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
@@ -46,8 +46,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 {
; CHECK-NEXT: cmp r5, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: subs r0, r5, #1
; CHECK-NEXT: and r12, r5, #3
+; CHECK-NEXT: subs r0, r5, #1
; CHECK-NEXT: cmp r0, #3
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
diff --git a/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll b/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll
index 4e700ce493b353..e260286fb692a0 100644
--- a/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll
+++ b/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll
@@ -58,8 +58,8 @@ define void @double_foobar() {
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: movs r0, #2
; CHECK-NEXT: str r0, [r1]
-; CHECK-NEXT: add r1, sp, #4
; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: add r1, sp, #4
; CHECK-NEXT: ldr r0, [r1, #8]
; CHECK-NEXT: mov sp, r0
; CHECK-NEXT: ldr r0, [r1, #4]
>From 8b87e2ffcf597d7eba1cf3ff12bea1fae1f999e6 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Mon, 20 Jan 2025 08:03:11 +0300
Subject: [PATCH 2/5] Comment the check
---
llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index b2c3a0109b3a91..78e4ab19d3e53e 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -214,6 +214,14 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
unsigned OpIdx = MO.getOperandNo();
Register Reg = MO.getReg();
if (Reg.isPhysical()) {
+ // addPhysRegDataDeps uses the provided operand index to retrieve
+ // the operand use cycle from the scheduling model. If the operand
+ // is "fake" (e.g., an operand of a call instruction used to pass
+ // an argument to the called function.), the scheduling model may not
+ // have an entry for it. If this is the case, pass -1 as operand index,
+ // which will cause addPhysRegDataDeps to add an artificial dependency.
+ // FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses
+ // aliases. When fixing, make sure to update addPhysRegDataDeps, too.
bool IsRealUse = OpIdx < MIDesc.getNumOperands() ||
MIDesc.hasImplicitUseOfPhysReg(Reg);
for (MCRegUnit Unit : TRI->regunits(Reg))
@@ -267,6 +275,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
bool ImplicitPseudoUse = false;
SDep Dep;
if (UseOpIdx < 0) {
+ // FIXME: UseOpIdx can be passed to computeOperandLatency, which can
+ // pass it to findUseIdx, which treats it as unsigned. If this is
+ // the expected behavior, it should be commented.
Dep = SDep(SU, SDep::Artificial);
} else {
// Set the hasPhysRegDefs only for physreg defs that have a use within
>From d709f4b7013bd12c7a631bfe539ab90bfddd500e Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <s.barannikov at module.ru>
Date: Mon, 20 Jan 2025 09:57:08 +0300
Subject: [PATCH 3/5] Update remaining tests
---
.../AMDGPU/call-preserved-registers.ll | 52 +++++----
.../CodeGen/AMDGPU/call-reqd-group-size.ll | 105 ++++++++++++++++--
.../callee-special-input-sgprs-fixed-abi.ll | 5 +-
llvm/test/CodeGen/AMDGPU/sibling-call.ll | 6 +-
4 files changed, 130 insertions(+), 38 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index c7f9ff85806fc6..ff80e05197b0dd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -102,10 +102,10 @@ define hidden void @void_func_void_clobber_vcc() #2 {
}
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
-; GCN: s_mov_b64 s[34:35], vcc
-; GCN-NEXT: s_getpc_b64
+; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
+; GCN: s_mov_b64 s[34:35], vcc
; GCN-NEXT: s_swappc_b64
; GCN: s_mov_b64 vcc, s[34:35]
define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 {
@@ -142,21 +142,27 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; GCN: #ASMSTART
-; GCN-NEXT: ; def s33
-; GCN-NEXT: #ASMEND
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
-; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
-; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+
+; GCN: #ASMSTART
+; GCN-NEXT: ; def s33
+; GCN-NEXT: #ASMEND
+
+; GCN-NOT: s33
+
+; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
+; MUBUF: s_swappc_b64 s[30:31], s[4:5]
+
+; GCN-NOT: s33
+
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
-; GCN-NOT: s33
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 {
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
@@ -168,20 +174,20 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
-; GCN: s_mov_b32 s32, 0
-
-; GCN-NOT: s34
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def s34
-; GCN-NEXT: ;;#ASMEND
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GCN: s_mov_b32 s32, 0
+
+; GCN: ;;#ASMSTART
+; GCN-NEXT: ; def s34
+; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
+
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
@@ -200,19 +206,19 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
-; GCN-NOT: v32
-; GCN: s_mov_b32 s32, 0
-; GCN-NOT: v40
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def v40
-; GCN-NEXT: ;;#ASMEND
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
+; GCN: s_mov_b32 s32, 0
+
+; GCN: ;;#ASMSTART
+; GCN-NEXT: ; def v40
+; GCN-NEXT: ;;#ASMEND
+
+; GCN-NOT: v40
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
@@ -255,10 +261,10 @@ define hidden void @void_func_void_clobber_s34() #2 {
}
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
-; GCN: s_mov_b32 s32, 0
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
+; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
@@ -267,10 +273,10 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
}
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34:
-; GCN: s_mov_b32 s32, 0
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
+; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
index b8804a8636ef08..093ca55698fe36 100644
--- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s -check-prefix=GISEL
; Check for optimizing the passed implicit workitem ID based on the
; required group size. This should avoid a few bit packing operations.
@@ -13,15 +13,30 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
-; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee at rel32@hi+12
+; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_x_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 20, v2
+; GISEL-NEXT: v_lshl_or_b32 v31, v1, 10, v0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee at rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee at rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
@@ -34,13 +49,27 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
-; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_y_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_lshl_or_b32 v31, v2, 20, v0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
@@ -53,13 +82,27 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
-; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_z_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_lshl_or_b32 v31, v1, 10, v0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
@@ -79,6 +122,20 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_yz_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
@@ -91,13 +148,27 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
-; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1
+; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_xz_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_lshlrev_b32_e32 v31, 10, v1
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
@@ -118,6 +189,20 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
+;
+; GISEL-LABEL: known_xyz_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v31, 0
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @callee()
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 609425329e106b..5d4db904fe6ea9 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -198,11 +198,12 @@ define hidden void @use_workgroup_id_yz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s32, 0
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12
+; GCN-NOT: s6
+; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 5536a09538e6ee..2192ae6758102a 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -355,13 +355,13 @@ declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16
-; GCN: v_mov_b32_e32 v0, 0
-; GCN: v_mov_b32_e32 v30, 0
-
; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v30, 0
; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
+
define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
entry:
%alloca = alloca [3 x i32], align 16, addrspace(5)
>From 806fa90e5df7cb38770b958894ff7e577637f603 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <s.barannikov at module.ru>
Date: Mon, 20 Jan 2025 11:06:48 +0300
Subject: [PATCH 4/5] Update LSR test
---
llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
index 1614de8dbf5580..eb076776ee743b 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -23,9 +23,9 @@ define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind {
; A9-NEXT: add.w r4, lr, r2
; A9-NEXT: ldr.w r6, [lr, r2]
; A9-NEXT: add r0, r3
-; A9-NEXT: adds r3, r4, r2
-; A9-NEXT: add r0, r12
; A9-NEXT: ldr r5, [r4, r2]
+; A9-NEXT: add r0, r12
+; A9-NEXT: adds r3, r4, r2
; A9-NEXT: add r0, r6
; A9-NEXT: add r3, r2
; A9-NEXT: add r0, r5
>From b7cfe8be6ac732abbf7d49232337f0e6d909c50d Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88 at gmail.com>
Date: Mon, 20 Jan 2025 11:16:05 +0300
Subject: [PATCH 5/5] Try to improve FIXME
---
llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 78e4ab19d3e53e..75d970ae240404 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -275,9 +275,12 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
bool ImplicitPseudoUse = false;
SDep Dep;
if (UseOpIdx < 0) {
- // FIXME: UseOpIdx can be passed to computeOperandLatency, which can
- // pass it to findUseIdx, which treats it as unsigned. If this is
- // the expected behavior, it should be commented.
+ // FIXME: UseOpIdx passed to computeOperandLatency below should be
+ // non-negative. Currently a negative value is passed if UseOpIdx < 0
+ // and ImplicitPseudoDef is false. This could be fixed by setting
+ // ImplicitPseudoUse to true here (which is probably the right thing
+ // to do), but this crashes Hexagon backend and causes many test
+ // changes that need investigation.
Dep = SDep(SU, SDep::Artificial);
} else {
// Set the hasPhysRegDefs only for physreg defs that have a use within
More information about the llvm-commits
mailing list