[llvm] [AMDGPU] Stop adding implicit def of superreg in copyPhysReg (PR #125255)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 31 09:11:11 PST 2025
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/125255
Previously, when copyPhysReg expanded a COPY into multiple MOV
instructions, it added an implicit def of the destination superreg to the
first MOV. Removing these implicit defs does not cause any liveness
verification problems and still passes Vulkan CTS for correctness testing.
>From 439e949f92fb281809b11675a27d189b7ae0cb55 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 31 Jan 2025 15:55:49 +0000
Subject: [PATCH] [AMDGPU] Stop adding implicit def of superreg in copyPhysReg
Previously, when copyPhysReg expanded a COPY into multiple MOV
instructions, it added an implicit def of the destination superreg to the
first MOV. Removing these implicit defs does not cause any liveness
verification problems and still passes Vulkan CTS for correctness testing.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 6 +-
.../atomic_optimizations_mul_one.ll | 14 +-
.../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 16 +-
.../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 16 +-
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 94 +--
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 164 ++---
.../AMDGPU/GlobalISel/bitcast_38_i16.ll | 4 +-
.../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 20 +-
.../AMDGPU/GlobalISel/extractelement.i128.ll | 8 +-
.../AMDGPU/GlobalISel/extractelement.i16.ll | 20 +-
.../AMDGPU/GlobalISel/extractelement.i8.ll | 42 +-
.../AMDGPU/GlobalISel/extractelement.ll | 6 +-
.../GlobalISel/flat-scratch-init.gfx.ll | 6 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 4 +-
...licit-kernarg-backend-usage-global-isel.ll | 30 +-
.../AMDGPU/GlobalISel/insertelement.i16.ll | 166 ++---
.../AMDGPU/GlobalISel/insertelement.i8.ll | 74 +--
.../AMDGPU/GlobalISel/insertelement.ll | 562 ++++++++---------
.../GlobalISel/llvm.amdgcn.div.scale.ll | 12 +-
.../llvm.amdgcn.image.load.2darraymsaa.a16.ll | 12 +-
.../llvm.amdgcn.image.load.3d.a16.ll | 22 +-
.../GlobalISel/llvm.amdgcn.image.store.2d.ll | 4 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 76 +--
.../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 7 +-
.../GlobalISel/llvm.amdgcn.update.dpp.ll | 4 +-
.../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 22 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 2 +-
.../AMDGPU/GlobalISel/regbankselect-mui.ll | 8 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 86 +--
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 43 +-
.../AMDGPU/GlobalISel/shl-ext-reduce.ll | 8 +-
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 86 +--
llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 6 +-
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 20 +-
.../GlobalISel/widen-i8-i16-scalar-loads.ll | 4 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 48 +-
.../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 51 +-
.../abi-attribute-hints-undefined-behavior.ll | 2 +-
llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 174 +++---
llvm/test/CodeGen/AMDGPU/add.ll | 34 +-
.../AMDGPU/agpr-copy-no-free-registers.ll | 6 +-
.../CodeGen/AMDGPU/agpr-copy-no-vgprs.mir | 4 +-
.../CodeGen/AMDGPU/agpr-copy-reuse-writes.mir | 6 +-
.../AMDGPU/agpr-copy-sgpr-no-vgprs.mir | 4 +-
.../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 36 +-
.../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 16 +-
.../atomic_optimizations_global_pointer.ll | 16 +-
.../atomic_optimizations_local_pointer.ll | 102 ++--
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 10 +-
llvm/test/CodeGen/AMDGPU/bitreverse.ll | 16 +-
...der-no-live-segment-at-def-implicit-def.ll | 6 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 178 +++---
.../buffer-fat-pointer-atomicrmw-fmax.ll | 148 +++--
.../buffer-fat-pointer-atomicrmw-fmin.ll | 148 +++--
.../CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll | 4 +-
llvm/test/CodeGen/AMDGPU/build_vector.ll | 2 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 30 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 12 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 52 +-
llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 20 +-
.../codegen-prepare-addrspacecast-non-null.ll | 4 +-
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 11 +-
.../CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir | 4 +-
.../CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir | 8 +-
...hys-reg-implicit-operand-kills-subregs.mir | 2 +-
llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir | 136 ++---
llvm/test/CodeGen/AMDGPU/ctlz.ll | 2 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 30 +-
llvm/test/CodeGen/AMDGPU/ctpop64.ll | 6 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 2 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 36 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 28 +-
llvm/test/CodeGen/AMDGPU/ds_read2.ll | 28 +-
llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +-
.../CodeGen/AMDGPU/extract_vector_dynelt.ll | 36 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fabs.ll | 2 +-
.../fast-unaligned-load-store.global.ll | 16 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 26 +-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fdiv.ll | 20 +-
.../CodeGen/AMDGPU/fence-lds-read2-write2.ll | 4 +-
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 4 +-
llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 48 +-
.../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 40 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 428 ++++++-------
.../AMDGPU/flat_atomics_i64_noprivate.ll | 132 ++--
.../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 216 +++----
.../flat_atomics_i64_system_noprivate.ll | 198 +++---
llvm/test/CodeGen/AMDGPU/fmed3.ll | 2 +-
.../AMDGPU/fmul-2-combine-multi-use.ll | 48 +-
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 4 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fp-classify.ll | 28 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 4 +-
.../AMDGPU/gfx-callable-return-types.ll | 4 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 12 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 4 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 4 +-
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 12 +-
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 4 +-
llvm/test/CodeGen/AMDGPU/global_atomics.ll | 16 +-
.../AMDGPU/global_atomics_i32_system.ll | 30 +-
.../test/CodeGen/AMDGPU/global_atomics_i64.ll | 112 ++--
.../AMDGPU/global_atomics_i64_system.ll | 90 +--
.../AMDGPU/global_atomics_scan_fadd.ll | 280 ++++-----
.../AMDGPU/global_atomics_scan_fmax.ll | 208 +++----
.../AMDGPU/global_atomics_scan_fmin.ll | 208 +++----
.../AMDGPU/global_atomics_scan_fsub.ll | 280 ++++-----
llvm/test/CodeGen/AMDGPU/half.ll | 214 +++----
.../identical-subrange-spill-infloop.ll | 6 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 8 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 86 +--
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 48 +-
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 8 +-
llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 6 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 83 +--
.../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 128 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 102 ++--
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 62 +-
.../AMDGPU/llvm.amdgcn.image.sample.dim.ll | 4 +-
.../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 6 +-
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 88 +--
.../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 2 +-
.../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 96 +--
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 22 +-
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 50 +-
.../llvm.amdgcn.pops.exiting.wave.id.ll | 68 ++-
.../llvm.amdgcn.raw.tbuffer.store.d16.ll | 3 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 14 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 2 +-
.../llvm.amdgcn.sched.group.barrier.gfx11.ll | 84 ++-
.../llvm.amdgcn.sched.group.barrier.gfx12.ll | 48 +-
.../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 114 ++--
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 98 +--
.../llvm.amdgcn.struct.tbuffer.store.d16.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 36 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 12 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 12 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 20 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 2 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 2 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 32 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 22 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 4 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 446 +++++++-------
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 464 +++++++-------
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 285 ++++-----
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 69 +--
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 574 +++++++++---------
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 271 +++++----
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 175 +++---
.../lower-work-group-id-intrinsics-hsa.ll | 20 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 8 +-
.../CodeGen/AMDGPU/max-hard-clause-length.ll | 8 +-
.../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 4 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 4 +-
llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 48 +-
llvm/test/CodeGen/AMDGPU/memory_clause.ll | 6 +-
llvm/test/CodeGen/AMDGPU/min.ll | 20 +-
.../AMDGPU/module-lds-false-sharing.ll | 14 +-
llvm/test/CodeGen/AMDGPU/or.ll | 16 +-
.../AMDGPU/pal-simple-indirect-call.ll | 5 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 16 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 4 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 2 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 4 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 8 +-
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 32 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 6 +-
llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir | 40 +-
.../sgpr-spill-update-only-slot-indexes.ll | 6 +-
.../CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 4 +-
.../AMDGPU/shufflevector.v2i64.v8i64.ll | 40 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 6 +-
llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 16 +-
llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll | 2 +-
llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 2 +-
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 79 ++-
llvm/test/CodeGen/AMDGPU/srem.ll | 12 +-
llvm/test/CodeGen/AMDGPU/srem64.ll | 30 +-
...tack-pointer-offset-relative-frameindex.ll | 4 +-
.../CodeGen/AMDGPU/stacksave_stackrestore.ll | 12 +-
llvm/test/CodeGen/AMDGPU/store-local.128.ll | 19 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 2 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 9 +-
.../AMDGPU/subreg-coalescer-undef-use.ll | 8 +-
llvm/test/CodeGen/AMDGPU/swdev380865.ll | 2 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 2 +-
llvm/test/CodeGen/AMDGPU/trunc-store.ll | 4 +-
llvm/test/CodeGen/AMDGPU/uaddo.ll | 8 +-
llvm/test/CodeGen/AMDGPU/udiv64.ll | 30 +-
llvm/test/CodeGen/AMDGPU/udivrem.ll | 2 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 28 +-
llvm/test/CodeGen/AMDGPU/urem64.ll | 20 +-
llvm/test/CodeGen/AMDGPU/usubo.ll | 8 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 6 +-
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 4 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 4 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +-
llvm/test/CodeGen/AMDGPU/wqm.ll | 2 +-
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 16 +-
llvm/test/CodeGen/AMDGPU/xor.ll | 12 +-
222 files changed, 5192 insertions(+), 5131 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 35667801c809d5..116bb09de0f99f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -627,13 +627,11 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
/// possible to have a direct copy in these cases on GFX908, so an intermediate
/// VGPR copy is required.
-static void indirectCopyToAGPR(const SIInstrInfo &TII,
- MachineBasicBlock &MBB,
+static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
RegScavenger &RS, bool RegsOverlap,
- Register ImpDefSuperReg = Register(),
Register ImpUseSuperReg = Register()) {
assert((TII.getSubtarget().hasMAIInsts() &&
!TII.getSubtarget().hasGFX90AInsts()) &&
@@ -681,10 +679,9 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
}
MachineInstrBuilder Builder =
- BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
- .add(DefOp);
- if (ImpDefSuperReg)
- Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
+ BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
+ DestReg)
+ .add(DefOp);
if (ImpUseSuperReg) {
Builder.addReg(ImpUseSuperReg,
@@ -738,12 +735,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
getKillRegState(KillSrc) | RegState::Implicit);
}
- MachineInstrBuilder DefBuilder
- = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
- .addReg(Tmp, RegState::Kill);
-
- if (ImpDefSuperReg)
- DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
+ BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .addReg(Tmp, RegState::Kill);
}
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
@@ -791,9 +784,6 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
if (!Forward)
std::swap(FirstMI, LastMI);
- FirstMI->addOperand(
- MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
-
if (KillSrc)
LastMI->addRegisterKilled(SrcReg, &RI);
}
@@ -1118,34 +1108,27 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
- bool IsFirstSubreg = Idx == 0;
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
- Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
Register ImpUseSuper = SrcReg;
indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
- *RS, Overlap, ImpDefSuper, ImpUseSuper);
+ *RS, Overlap, ImpUseSuper);
} else if (Opcode == AMDGPU::V_PK_MOV_B32) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
- .addImm(SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0) // clamp
- .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
- if (IsFirstSubreg)
- MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
- if (IsFirstSubreg)
- Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index ff5880819020da..99daa0e99b8c99 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -679,8 +679,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_saddo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s4, s0, s2
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_addc_u32 s5, s1, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -696,8 +696,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_saddo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s5, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -713,8 +713,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 28ed88f4cf8fb8..f48a4823a6d18a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -96,8 +96,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -192,8 +192,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -294,8 +294,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_xor_b32_e32 v4, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -392,8 +392,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -492,8 +492,8 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -598,8 +598,8 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_xor_b32_e32 v4, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 424388a30e99b4..b60b20f1f0aefb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1811,9 +1811,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1852,9 +1852,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1904,10 +1904,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1935,10 +1935,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index b52a39f1a55c8f..ebbd7856826292 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1811,9 +1811,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1852,9 +1852,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1904,10 +1904,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1935,10 +1935,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index b96fc71be057e7..012bcfc88368e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1025,14 +1025,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1112,14 +1112,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -1255,9 +1255,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1329,9 +1329,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1435,18 +1435,18 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@@ -1533,13 +1533,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20
@@ -1811,9 +1811,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1876,8 +1876,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1889,9 +1889,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1955,8 +1955,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1968,9 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1999,8 +1999,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2
@@ -2025,8 +2025,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
@@ -2042,15 +2042,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2073,8 +2073,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2085,12 +2085,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2155,8 +2156,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
@@ -2189,12 +2190,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -3059,8 +3061,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3082,8 +3084,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3232,8 +3234,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v4, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; CI-NEXT: v_mov_b32_e32 v4, s3
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
; CI-NEXT: v_mov_b32_e32 v4, s1
@@ -3251,8 +3253,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: v_mov_b32_e32 v4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index e1397e7331d3ce..d08ee01f47fa93 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -1995,8 +1995,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2018,8 +2018,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -2348,27 +2348,27 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2448,28 +2448,28 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
@@ -2616,9 +2616,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2629,9 +2629,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX12-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2701,9 +2701,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2714,9 +2714,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2819,18 +2819,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@@ -2841,17 +2841,17 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -2939,13 +2939,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
@@ -2958,13 +2958,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV
@@ -2989,8 +2989,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v4, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; CI-NEXT: v_mov_b32_e32 v4, s3
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
; CI-NEXT: v_mov_b32_e32 v4, s1
@@ -3008,8 +3008,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: v_mov_b32_e32 v4, s1
@@ -3495,9 +3495,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3509,9 +3509,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 42
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3572,8 +3572,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3585,9 +3585,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3599,9 +3599,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 42
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3663,8 +3663,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 32
; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3676,9 +3676,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3690,9 +3690,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 42
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -3720,8 +3720,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2
@@ -3746,8 +3746,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
@@ -3763,15 +3763,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3794,8 +3794,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3806,12 +3806,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3829,12 +3830,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 42
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3899,8 +3901,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
@@ -3933,12 +3935,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -3952,12 +3955,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 42
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
index 37fc0e0282690a..4fed1b4d2c1078 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll
@@ -8,11 +8,10 @@ define void @main(<19 x i32> %arg) {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_mov_b32 s12, s4
; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 s12, s4
; GCN-NEXT: s_mov_b32 s13, s4
-; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_mov_b32 s6, s4
; GCN-NEXT: s_mov_b32 s7, s4
@@ -23,6 +22,7 @@ define void @main(<19 x i32> %arg) {
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm
; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 4ddbb0afd7fc58..b789304e735d1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -476,10 +476,10 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -696,6 +696,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; VI-NEXT: flat_load_ubyte v3, v[6:7]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -706,7 +707,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v3, v1, v0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
@@ -1013,11 +1013,11 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1059,10 +1059,10 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1104,10 +1104,10 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1174,6 +1174,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
; VI-NEXT: flat_load_ubyte v3, v[6:7]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1184,7 +1185,6 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v3, v1, v0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
@@ -1229,10 +1229,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1273,10 +1273,10 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1318,10 +1318,10 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1362,10 +1362,10 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index e1ce9ea14a2a95..766c22437e97a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -55,10 +55,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 3
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
@@ -72,10 +72,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 3
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 021f609053a0f6..08a40ff922b60d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -35,10 +35,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg
; GFX7-NEXT: s_and_b32 s2, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, s2, 1
; GFX7-NEXT: s_mov_b32 s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -74,10 +74,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 3
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
@@ -88,10 +88,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 3
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
@@ -678,10 +678,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg
; GFX7-NEXT: s_and_b32 s2, s4, 7
; GFX7-NEXT: s_lshl_b32 s4, s2, 1
; GFX7-NEXT: s_mov_b32 s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -717,10 +717,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 7
-; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
@@ -731,10 +731,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 7
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c2394ec461490f..0cc8c58ea688c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -35,11 +35,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p
; GFX7: ; %bb.0:
; GFX7-NEXT: s_and_b32 s4, s4, 3
; GFX7-NEXT: s_ashr_i32 s5, s4, 31
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -79,8 +79,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 3
; GFX9-NEXT: s_ashr_i32 s1, s0, 31
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
@@ -92,8 +92,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
@@ -116,8 +116,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s2, 3
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
@@ -130,7 +130,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX11-NEXT: s_and_b32 s0, s2, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -262,8 +262,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p
;
; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v2, 3, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -688,11 +688,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p
; GFX7: ; %bb.0:
; GFX7-NEXT: s_and_b32 s4, s4, 7
; GFX7-NEXT: s_ashr_i32 s5, s4, 31
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -732,8 +732,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 7
; GFX9-NEXT: s_ashr_i32 s1, s0, 31
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
@@ -745,8 +745,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 7
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
@@ -769,8 +769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s2, 7
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
@@ -783,7 +783,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX11-NEXT: s_and_b32 s0, s2, 7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -915,8 +915,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p
;
; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v2, 7, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 7, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1725,11 +1725,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_and_b32 s4, s4, 15
; GFX7-NEXT: s_ashr_i32 s5, s4, 31
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1769,8 +1769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_ashr_i32 s1, s0, 31
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
@@ -1782,8 +1782,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s2, 15
; GFX8-NEXT: s_ashr_i32 s1, s0, 31
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
@@ -1806,8 +1806,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_ashr_i32 s1, s0, 31
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
@@ -1820,7 +1820,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
; GFX11-NEXT: s_and_b32 s0, s2, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -1952,8 +1952,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %
;
; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v2, 15, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 15, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index f2a4332bcb8ba6..26c6f04c4cd24a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3182,8 +3182,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
-; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
+; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; MOVREL-NEXT: s_endpgm
@@ -4183,8 +4183,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cselect_b32 s3, 0x40400000, s3
; MOVREL-NEXT: s_cmp_eq_u32 s2, 3
; MOVREL-NEXT: s_cselect_b32 s2, 4.0, s3
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
+; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: flat_store_dword v[0:1], v2
; MOVREL-NEXT: s_endpgm
@@ -4534,8 +4534,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cmp_eq_u32 s6, 3
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
-; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
+; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; MOVREL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index 4fdb4082346af6..13f91b09bcc6af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -11,11 +11,11 @@ define amdgpu_ps void @amdgpu_ps() {
; MESA: ; %bb.0:
; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT: s_mov_b32 s0, 0
; MESA-NEXT: s_mov_b64 s[2:3], src_private_base
+; MESA-NEXT: s_mov_b32 s0, 0
; MESA-NEXT: s_mov_b32 s1, s3
-; MESA-NEXT: v_mov_b32_e32 v0, s0
; MESA-NEXT: v_mov_b32_e32 v2, 0
+; MESA-NEXT: v_mov_b32_e32 v0, s0
; MESA-NEXT: v_mov_b32_e32 v1, s1
; MESA-NEXT: flat_store_dword v[0:1], v2
; MESA-NEXT: s_waitcnt vmcnt(0)
@@ -31,8 +31,8 @@ define amdgpu_ps void @amdgpu_ps() {
; PAL-NEXT: s_and_b32 s3, s3, 0xffff
; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; PAL-NEXT: s_mov_b32 s0, 0
; PAL-NEXT: s_mov_b64 s[2:3], src_private_base
+; PAL-NEXT: s_mov_b32 s0, 0
; PAL-NEXT: s_mov_b32 s1, s3
; PAL-NEXT: v_mov_b32_e32 v0, s0
; PAL-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e4e6c44b051c32..79351bbf66abd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -869,10 +869,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
; VI-NEXT: v_trunc_f32_e32 v2, v2
; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1031,10 +1031,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v4, -v4, v7, v5
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7
; VI-NEXT: v_trunc_f32_e32 v4, v4
; VI-NEXT: v_fma_f32 v3, -v4, v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 54cb0777e9b2b7..775681fa794810 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -21,13 +21,13 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: s_mov_b32 s6, s1
; GFX8V4-NEXT: s_mov_b32 s7, s2
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
-; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
-; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v2, 2
+; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -42,16 +42,16 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: s_mov_b32 s4, s0
; GFX8V5-NEXT: s_mov_b32 s5, s2
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
-; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8V5-NEXT: s_mov_b32 s2, s1
+; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
-; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
-; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v2, 2
+; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
@@ -66,16 +66,16 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_mov_b32 s2, s0
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
-; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V4-NEXT: s_mov_b32 s4, s1
+; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
-; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
-; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: v_mov_b32_e32 v2, 2
+; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: v_mov_b32_e32 v1, s1
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -90,16 +90,16 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_mov_b32 s2, s0
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
-; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V5-NEXT: s_mov_b32 s4, s1
+; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
-; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
-; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 2
+; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: v_mov_b32_e32 v1, s1
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
@@ -269,11 +269,11 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
-; GFX8V4-NEXT: s_add_u32 s0, s8, 8
-; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
@@ -295,11 +295,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
-; GFX8V5-NEXT: s_add_u32 s0, s8, 8
-; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 4ae98ff1edf6c9..74f743ea9b389e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -1025,17 +1025,17 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_cselect_b32 s3, s1, s0
; GFX11-NEXT: s_and_b32 s4, s4, 1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_lshl_b32 s4, s4, 4
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX11-NEXT: s_and_not1_b32 s3, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -1144,9 +1144,9 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX10-NEXT: v_not_b32_e32 v2, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3
; GFX10-NEXT: v_mov_b32_e32 v2, 0
@@ -1166,11 +1166,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -1281,8 +1281,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2
@@ -1306,8 +1306,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX11-NEXT: v_not_b32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -1792,8 +1792,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_cselect_b32 s2, s4, s2
; GFX7-NEXT: s_cmp_eq_u32 s6, 3
; GFX7-NEXT: s_cselect_b32 s3, s4, s3
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
@@ -1865,10 +1865,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_cselect_b32 s2, s4, s2
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-NEXT: s_cselect_b32 s3, s4, s3
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_endpgm
%vec = load <8 x i16>, ptr addrspace(4) %ptr
@@ -2181,21 +2180,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_cmp_eq_u32 s5, 1
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_cselect_b32 s6, s1, s0
; GFX11-NEXT: s_cmp_eq_u32 s5, 2
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_cselect_b32 s6, s2, s6
; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_cselect_b32 s6, s3, s6
; GFX11-NEXT: s_and_b32 s4, s4, 1
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_lshl_b32 s4, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s4, 4
; GFX11-NEXT: s_lshl_b32 s7, 0xffff, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 s6, s6, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
@@ -2343,13 +2343,13 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX10-NEXT: v_not_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s9
+; GFX10-NEXT: v_mov_b32_e32 v1, s9
+; GFX10-NEXT: v_mov_b32_e32 v2, s10
+; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s10
-; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
@@ -2366,27 +2366,29 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0
; GFX11-NEXT: v_and_b32_e32 v1, 1, v0
; GFX11-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, s11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
+; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_not_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, s10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
@@ -2527,13 +2529,13 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX10-NEXT: v_not_b32_e32 v5, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
@@ -2560,14 +2562,14 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
; GFX11-NEXT: v_not_b32_e32 v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v3, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v4, 0
@@ -3088,17 +3090,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
; GFX9-NEXT: s_cmp_eq_u32 s7, 5
; GFX9-NEXT: s_cselect_b32 s5, s16, s13
; GFX9-NEXT: s_cmp_eq_u32 s7, 6
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_cselect_b32 s6, s16, s14
; GFX9-NEXT: s_cmp_eq_u32 s7, 7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cselect_b32 s7, s16, s15
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: v_mov_b32_e32 v4, 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
@@ -3118,17 +3120,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: s_andn2_b32 s0, s0, s1
; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_movreld_b32 s8, s0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: s_movreld_b32 s8, s0
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
@@ -3149,8 +3151,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: s_andn2_b32 s0, s0, s1
; GFX7-NEXT: s_or_b32 s0, s0, s2
; GFX7-NEXT: s_movreld_b32 s8, s0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
@@ -3541,12 +3543,12 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
; GFX11-NEXT: v_mov_b32_e32 v10, 16
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_movrels_b32 s2, s8
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
; GFX11-NEXT: s_and_not1_b32 s1, s2, s1
-; GFX11-NEXT: v_mov_b32_e32 v1, s9
-; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15
; GFX11-NEXT: v_mov_b32_e32 v6, s14
; GFX11-NEXT: v_mov_b32_e32 v8, 0
@@ -3764,30 +3766,30 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_not_b32_e32 v9, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v2, s10
+; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v4, s12
+; GFX10-NEXT: v_mov_b32_e32 v5, s13
+; GFX10-NEXT: v_mov_b32_e32 v6, s14
+; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s10
-; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, s12
-; GFX10-NEXT: v_mov_b32_e32 v5, s13
-; GFX10-NEXT: v_mov_b32_e32 v6, s14
-; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 16
+; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
@@ -3817,9 +3819,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
-; GFX11-NEXT: v_not_b32_e32 v9, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; GFX11-NEXT: v_not_b32_e32 v9, v2
+; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1
@@ -3827,20 +3833,18 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
+; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8
-; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
-; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
@@ -4055,30 +4059,30 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: v_not_b32_e32 v9, v3
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, s9
+; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v4, s12
+; GFX10-NEXT: v_mov_b32_e32 v5, s13
+; GFX10-NEXT: v_mov_b32_e32 v6, s14
+; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
-; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, s12
-; GFX10-NEXT: v_mov_b32_e32 v5, s13
-; GFX10-NEXT: v_mov_b32_e32 v6, s14
-; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 16
+; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
@@ -4103,37 +4107,39 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s9
; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0
-; GFX11-NEXT: v_not_b32_e32 v9, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13
+; GFX11-NEXT: v_mov_b32_e32 v1, s9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v4, s12
+; GFX11-NEXT: v_mov_b32_e32 v6, s14
+; GFX11-NEXT: v_not_b32_e32 v9, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, s11
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, s15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13
-; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
-; GFX11-NEXT: v_mov_b32_e32 v7, s15
-; GFX11-NEXT: v_mov_b32_e32 v3, s11
+; GFX11-NEXT: v_mov_b32_e32 v2, s10
; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, s12
-; GFX11-NEXT: v_mov_b32_e32 v6, s14
; GFX11-NEXT: v_mov_b32_e32 v8, 0
-; GFX11-NEXT: v_mov_b32_e32 v9, 0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16
+; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index d4b9bc6d2e3c1d..7902a5db212a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -1592,8 +1592,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_cselect_b32 s2, s4, s0
; GFX7-NEXT: s_cmp_eq_u32 s3, 1
; GFX7-NEXT: s_cselect_b32 s3, s4, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -1620,8 +1620,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX10-NEXT: s_cselect_b32 s0, s3, s0
; GFX10-NEXT: s_cmp_eq_u32 s2, 1
; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
@@ -1888,17 +1888,17 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_cselect_b32 s3, s1, s0
; GFX11-NEXT: s_and_b32 s4, s4, 3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-NEXT: s_lshl_b32 s5, 0xff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xff, s4
; GFX11-NEXT: s_and_not1_b32 s3, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -2007,9 +2007,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX10-NEXT: v_not_b32_e32 v2, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3
; GFX10-NEXT: v_mov_b32_e32 v2, 0
@@ -2029,11 +2029,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 3, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2144,8 +2144,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2
@@ -2169,8 +2169,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
; GFX11-NEXT: v_not_b32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2655,8 +2655,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_cselect_b32 s2, s4, s2
; GFX7-NEXT: s_cmp_eq_u32 s6, 3
; GFX7-NEXT: s_cselect_b32 s3, s4, s3
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
@@ -2728,10 +2728,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_cselect_b32 s2, s4, s2
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-NEXT: s_cselect_b32 s3, s4, s3
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_endpgm
%vec = load <16 x i8>, ptr addrspace(4) %ptr
@@ -3044,21 +3043,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_cmp_eq_u32 s5, 1
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
; GFX11-NEXT: s_cselect_b32 s6, s1, s0
; GFX11-NEXT: s_cmp_eq_u32 s5, 2
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_cselect_b32 s6, s2, s6
; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_cselect_b32 s6, s3, s6
; GFX11-NEXT: s_and_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_lshl_b32 s7, 0xff, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 s6, s6, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
@@ -3206,13 +3206,13 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX10-NEXT: v_not_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s9
+; GFX10-NEXT: v_mov_b32_e32 v1, s9
+; GFX10-NEXT: v_mov_b32_e32 v2, s10
+; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s10
-; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
@@ -3229,27 +3229,29 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v0
; GFX11-NEXT: v_and_b32_e32 v1, 3, v0
; GFX11-NEXT: s_and_b32 s1, s4, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 3, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, s11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
+; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_not_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, s10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
@@ -3390,13 +3392,13 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX10-NEXT: v_not_b32_e32 v5, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
@@ -3423,14 +3425,14 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
; GFX11-NEXT: v_not_b32_e32 v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v3, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 298dfcf048fc46..4c3aee6e712b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -304,12 +304,12 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v8, v0
+; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 m0, s10
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: v_mov_b32_e32 v6, s6
; GFX11-NEXT: v_movreld_b32_e32 v0, v8
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -691,7 +691,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_mov_b32 s14, 0
; GPRIDX-NEXT: s_mov_b32 s12, 0
; GPRIDX-NEXT: s_mov_b32 s8, 0
-; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0
; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000
; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000
; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000
@@ -699,6 +698,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0
; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000
; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0
+; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0
; GPRIDX-NEXT: v_mov_b32_e32 v3, s4
; GPRIDX-NEXT: v_mov_b32_e32 v4, s5
; GPRIDX-NEXT: v_mov_b32_e32 v5, s6
@@ -818,22 +818,22 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GFX11-LABEL: dyn_insertelement_v8f64_const_s_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
; GFX11-NEXT: s_mov_b32 s14, 0
-; GFX11-NEXT: s_mov_b32 s15, 0x40200000
; GFX11-NEXT: s_mov_b32 s12, 0
; GFX11-NEXT: s_mov_b32 s10, 0
; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_mov_b64 s[0:1], 1.0
+; GFX11-NEXT: s_mov_b32 s15, 0x40200000
; GFX11-NEXT: s_mov_b32 s13, 0x401c0000
; GFX11-NEXT: s_mov_b32 s11, 0x40180000
; GFX11-NEXT: s_mov_b32 s9, 0x40140000
; GFX11-NEXT: s_mov_b64 s[6:7], 4.0
; GFX11-NEXT: s_mov_b32 s5, 0x40080000
; GFX11-NEXT: s_mov_b64 s[2:3], 2.0
-; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
@@ -884,22 +884,22 @@ entry:
define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
+; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
+; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: v_mov_b32_e32 v16, s15
; GPRIDX-NEXT: v_mov_b32_e32 v15, s14
; GPRIDX-NEXT: v_mov_b32_e32 v14, s13
@@ -954,26 +954,26 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
;
; GFX10-LABEL: dyn_insertelement_v8f64_s_s_v:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s11, s13
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_mov_b32_e32 v16, s15
+; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v16, s15
; GFX10-NEXT: v_mov_b32_e32 v15, s14
; GFX10-NEXT: v_mov_b32_e32 v14, s13
; GFX10-NEXT: v_mov_b32_e32 v13, s12
@@ -1022,25 +1022,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
;
; GFX11-LABEL: dyn_insertelement_v8f64_s_s_v:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: s_mov_b32 s11, s13
-; GFX11-NEXT: s_mov_b32 s13, s15
-; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s11, s13
; GFX11-NEXT: s_mov_b32 s12, s14
+; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14
+; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14
; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12
; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10
; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8
@@ -1095,22 +1095,22 @@ entry:
define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
+; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
+; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: v_mov_b32_e32 v17, s15
; GPRIDX-NEXT: v_mov_b32_e32 v16, s14
; GPRIDX-NEXT: v_mov_b32_e32 v15, s13
@@ -1144,25 +1144,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
;
; GFX10-LABEL: dyn_insertelement_v8f64_s_v_s:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s11, s13
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_mov_b32_e32 v17, s15
+; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_lshl_b32 m0, s18, 1
+; GFX10-NEXT: v_mov_b32_e32 v17, s15
; GFX10-NEXT: v_mov_b32_e32 v16, s14
; GFX10-NEXT: v_mov_b32_e32 v15, s13
; GFX10-NEXT: v_mov_b32_e32 v14, s12
@@ -1191,25 +1191,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
;
; GFX11-LABEL: dyn_insertelement_v8f64_s_v_s:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: s_mov_b32 s11, s13
-; GFX11-NEXT: s_mov_b32 s13, s15
-; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s11, s13
; GFX11-NEXT: s_mov_b32 s12, s14
+; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
+; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_lshl_b32 m0, s18, 1
+; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
@@ -1303,22 +1303,22 @@ entry:
define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
+; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
+; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
@@ -1371,26 +1371,26 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
;
; GFX10-LABEL: dyn_insertelement_v8f64_s_v_v:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s11, s13
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_mov_b32_e32 v18, s15
+; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_mov_b32_e32 v18, s15
; GFX10-NEXT: v_mov_b32_e32 v17, s14
; GFX10-NEXT: v_mov_b32_e32 v16, s13
; GFX10-NEXT: v_mov_b32_e32 v15, s12
@@ -1439,25 +1439,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
;
; GFX11-LABEL: dyn_insertelement_v8f64_s_v_v:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: s_mov_b32 s11, s13
-; GFX11-NEXT: s_mov_b32 s13, s15
-; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s11, s13
; GFX11-NEXT: s_mov_b32 s12, s14
+; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
+; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
@@ -2832,13 +2832,13 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_dual_mov_b32 v10, v0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 m0, s12
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
-; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT: v_mov_b32_e32 v8, s8
; GFX11-NEXT: v_movreld_b32_e32 v0, v10
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -3348,14 +3348,14 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg
; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s10, s12
; GFX11-NEXT: s_mov_b32 s11, s13
-; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_dual_mov_b32 v12, v0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 m0, s14
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
-; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
-; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT: v_mov_b32_e32 v10, s10
; GFX11-NEXT: v_movreld_b32_e32 v0, v12
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -4077,22 +4077,22 @@ entry:
define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %vec, i32 %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v16i32_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
+; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
+; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: v_mov_b32_e32 v16, s15
; GPRIDX-NEXT: v_mov_b32_e32 v15, s14
; GPRIDX-NEXT: v_mov_b32_e32 v14, s13
@@ -4132,25 +4132,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve
;
; GFX10-LABEL: dyn_insertelement_v16i32_s_v_s:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
-; GFX10-NEXT: s_mov_b32 s11, s13
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s10, s12
+; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_mov_b32_e32 v16, s15
+; GFX10-NEXT: s_mov_b32 s15, s17
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_mov_b32 m0, s18
+; GFX10-NEXT: v_mov_b32_e32 v16, s15
; GFX10-NEXT: v_mov_b32_e32 v15, s14
; GFX10-NEXT: v_mov_b32_e32 v14, s13
; GFX10-NEXT: v_mov_b32_e32 v13, s12
@@ -4186,25 +4186,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve
;
; GFX11-LABEL: dyn_insertelement_v16i32_s_v_s:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
-; GFX11-NEXT: s_mov_b32 s11, s13
-; GFX11-NEXT: s_mov_b32 s13, s15
-; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
+; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s10, s12
+; GFX11-NEXT: s_mov_b32 s11, s13
; GFX11-NEXT: s_mov_b32 s12, s14
+; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14
+; GFX11-NEXT: s_mov_b32 s15, s17
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_mov_b32 m0, s18
+; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14
; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12
; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10
; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8
@@ -4332,16 +4332,16 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg
; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: s_mov_b32 s14, s16
; GFX11-NEXT: s_mov_b32 s15, s17
-; GFX11-NEXT: v_mov_b32_e32 v16, v0
+; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 m0, s18
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
-; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
-; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10
-; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12
-; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13
+; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX11-NEXT: v_mov_b32_e32 v14, s14
; GFX11-NEXT: v_movreld_b32_e32 v0, v16
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -4527,24 +4527,24 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg
; GFX11-NEXT: s_mov_b32 s29, s31
; GFX11-NEXT: s_mov_b32 s31, s33
; GFX11-NEXT: s_mov_b32 s30, s32
-; GFX11-NEXT: v_mov_b32_e32 v32, v0
+; GFX11-NEXT: v_dual_mov_b32 v32, v0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_mov_b32 m0, s34
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
-; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8
-; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10
-; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12
-; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14
-; GFX11-NEXT: v_dual_mov_b32 v17, s17 :: v_dual_mov_b32 v16, s16
-; GFX11-NEXT: v_dual_mov_b32 v19, s19 :: v_dual_mov_b32 v18, s18
-; GFX11-NEXT: v_dual_mov_b32 v21, s21 :: v_dual_mov_b32 v20, s20
-; GFX11-NEXT: v_dual_mov_b32 v23, s23 :: v_dual_mov_b32 v22, s22
-; GFX11-NEXT: v_dual_mov_b32 v25, s25 :: v_dual_mov_b32 v24, s24
-; GFX11-NEXT: v_dual_mov_b32 v27, s27 :: v_dual_mov_b32 v26, s26
-; GFX11-NEXT: v_dual_mov_b32 v29, s29 :: v_dual_mov_b32 v28, s28
-; GFX11-NEXT: v_dual_mov_b32 v31, s31 :: v_dual_mov_b32 v30, s30
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9
+; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
+; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13
+; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s17
+; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
+; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
+; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v23, s23
+; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v25, s25
+; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v27, s27
+; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v29, s29
+; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v31, s31
+; GFX11-NEXT: v_mov_b32_e32 v30, s30
; GFX11-NEXT: v_movreld_b32_e32 v0, v32
; GFX11-NEXT: ; return to shader part epilog
entry:
@@ -4555,40 +4555,40 @@ entry:
define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %vec, i64 %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v16i64_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s10, s12
; GPRIDX-NEXT: s_mov_b32 s11, s13
+; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
+; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
+; GPRIDX-NEXT: s_mov_b32 s16, s18
; GPRIDX-NEXT: s_mov_b32 s17, s19
+; GPRIDX-NEXT: s_mov_b32 s18, s20
; GPRIDX-NEXT: s_mov_b32 s19, s21
+; GPRIDX-NEXT: s_mov_b32 s20, s22
; GPRIDX-NEXT: s_mov_b32 s21, s23
+; GPRIDX-NEXT: s_mov_b32 s22, s24
; GPRIDX-NEXT: s_mov_b32 s23, s25
+; GPRIDX-NEXT: s_mov_b32 s24, s26
; GPRIDX-NEXT: s_mov_b32 s25, s27
+; GPRIDX-NEXT: s_mov_b32 s26, s28
; GPRIDX-NEXT: s_mov_b32 s27, s29
+; GPRIDX-NEXT: s_mov_b32 s28, s30
; GPRIDX-NEXT: s_mov_b32 s29, s31
; GPRIDX-NEXT: s_mov_b32 s31, s33
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s16, s18
-; GPRIDX-NEXT: s_mov_b32 s18, s20
-; GPRIDX-NEXT: s_mov_b32 s20, s22
-; GPRIDX-NEXT: s_mov_b32 s22, s24
-; GPRIDX-NEXT: s_mov_b32 s24, s26
-; GPRIDX-NEXT: s_mov_b32 s26, s28
-; GPRIDX-NEXT: s_mov_b32 s28, s30
; GPRIDX-NEXT: s_mov_b32 s30, s32
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s31
; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1
+; GPRIDX-NEXT: v_mov_b32_e32 v33, s31
; GPRIDX-NEXT: v_mov_b32_e32 v32, s30
; GPRIDX-NEXT: v_mov_b32_e32 v31, s29
; GPRIDX-NEXT: v_mov_b32_e32 v30, s28
@@ -4660,41 +4660,41 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve
;
; GFX10-LABEL: dyn_insertelement_v16i64_s_v_s:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s14, s16
; GFX10-NEXT: s_mov_b32 s15, s17
+; GFX10-NEXT: s_mov_b32 s16, s18
; GFX10-NEXT: s_mov_b32 s17, s19
+; GFX10-NEXT: s_mov_b32 s18, s20
; GFX10-NEXT: s_mov_b32 s19, s21
+; GFX10-NEXT: s_mov_b32 s20, s22
; GFX10-NEXT: s_mov_b32 s21, s23
+; GFX10-NEXT: s_mov_b32 s22, s24
; GFX10-NEXT: s_mov_b32 s23, s25
+; GFX10-NEXT: s_mov_b32 s24, s26
; GFX10-NEXT: s_mov_b32 s25, s27
+; GFX10-NEXT: s_mov_b32 s26, s28
; GFX10-NEXT: s_mov_b32 s27, s29
+; GFX10-NEXT: s_mov_b32 s28, s30
; GFX10-NEXT: s_mov_b32 s29, s31
; GFX10-NEXT: s_mov_b32 s31, s33
-; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_mov_b32 s8, s10
-; GFX10-NEXT: s_mov_b32 s10, s12
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: s_mov_b32 s16, s18
-; GFX10-NEXT: s_mov_b32 s18, s20
-; GFX10-NEXT: s_mov_b32 s20, s22
-; GFX10-NEXT: s_mov_b32 s22, s24
-; GFX10-NEXT: s_mov_b32 s24, s26
-; GFX10-NEXT: s_mov_b32 s26, s28
-; GFX10-NEXT: s_mov_b32 s28, s30
; GFX10-NEXT: s_mov_b32 s30, s32
-; GFX10-NEXT: v_mov_b32_e32 v33, s31
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_lshl_b32 m0, s34, 1
+; GFX10-NEXT: v_mov_b32_e32 v33, s31
; GFX10-NEXT: v_mov_b32_e32 v32, s30
; GFX10-NEXT: v_mov_b32_e32 v31, s29
; GFX10-NEXT: v_mov_b32_e32 v30, s28
@@ -4763,41 +4763,41 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve
;
; GFX11-LABEL: dyn_insertelement_v16i64_s_v_s:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s10, s12
; GFX11-NEXT: s_mov_b32 s11, s13
+; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s13, s15
+; GFX11-NEXT: s_mov_b32 s14, s16
; GFX11-NEXT: s_mov_b32 s15, s17
+; GFX11-NEXT: s_mov_b32 s16, s18
; GFX11-NEXT: s_mov_b32 s17, s19
+; GFX11-NEXT: s_mov_b32 s18, s20
; GFX11-NEXT: s_mov_b32 s19, s21
+; GFX11-NEXT: s_mov_b32 s20, s22
; GFX11-NEXT: s_mov_b32 s21, s23
+; GFX11-NEXT: s_mov_b32 s22, s24
; GFX11-NEXT: s_mov_b32 s23, s25
+; GFX11-NEXT: s_mov_b32 s24, s26
; GFX11-NEXT: s_mov_b32 s25, s27
+; GFX11-NEXT: s_mov_b32 s26, s28
; GFX11-NEXT: s_mov_b32 s27, s29
+; GFX11-NEXT: s_mov_b32 s28, s30
; GFX11-NEXT: s_mov_b32 s29, s31
; GFX11-NEXT: s_mov_b32 s31, s33
-; GFX11-NEXT: s_mov_b32 s0, s2
-; GFX11-NEXT: s_mov_b32 s2, s4
-; GFX11-NEXT: s_mov_b32 s4, s6
-; GFX11-NEXT: s_mov_b32 s6, s8
-; GFX11-NEXT: s_mov_b32 s8, s10
-; GFX11-NEXT: s_mov_b32 s10, s12
-; GFX11-NEXT: s_mov_b32 s12, s14
-; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: s_mov_b32 s16, s18
-; GFX11-NEXT: s_mov_b32 s18, s20
-; GFX11-NEXT: s_mov_b32 s20, s22
-; GFX11-NEXT: s_mov_b32 s22, s24
-; GFX11-NEXT: s_mov_b32 s24, s26
-; GFX11-NEXT: s_mov_b32 s26, s28
-; GFX11-NEXT: s_mov_b32 s28, s30
; GFX11-NEXT: s_mov_b32 s30, s32
-; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_lshl_b32 m0, s34, 1
+; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30
; GFX11-NEXT: v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28
; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26
; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24
@@ -4855,40 +4855,40 @@ entry:
define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inreg %vec, double %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v16f64_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
+; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
+; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s10, s12
; GPRIDX-NEXT: s_mov_b32 s11, s13
+; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
+; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
+; GPRIDX-NEXT: s_mov_b32 s16, s18
; GPRIDX-NEXT: s_mov_b32 s17, s19
+; GPRIDX-NEXT: s_mov_b32 s18, s20
; GPRIDX-NEXT: s_mov_b32 s19, s21
+; GPRIDX-NEXT: s_mov_b32 s20, s22
; GPRIDX-NEXT: s_mov_b32 s21, s23
+; GPRIDX-NEXT: s_mov_b32 s22, s24
; GPRIDX-NEXT: s_mov_b32 s23, s25
+; GPRIDX-NEXT: s_mov_b32 s24, s26
; GPRIDX-NEXT: s_mov_b32 s25, s27
+; GPRIDX-NEXT: s_mov_b32 s26, s28
; GPRIDX-NEXT: s_mov_b32 s27, s29
+; GPRIDX-NEXT: s_mov_b32 s28, s30
; GPRIDX-NEXT: s_mov_b32 s29, s31
; GPRIDX-NEXT: s_mov_b32 s31, s33
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s16, s18
-; GPRIDX-NEXT: s_mov_b32 s18, s20
-; GPRIDX-NEXT: s_mov_b32 s20, s22
-; GPRIDX-NEXT: s_mov_b32 s22, s24
-; GPRIDX-NEXT: s_mov_b32 s24, s26
-; GPRIDX-NEXT: s_mov_b32 s26, s28
-; GPRIDX-NEXT: s_mov_b32 s28, s30
; GPRIDX-NEXT: s_mov_b32 s30, s32
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s31
; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1
+; GPRIDX-NEXT: v_mov_b32_e32 v33, s31
; GPRIDX-NEXT: v_mov_b32_e32 v32, s30
; GPRIDX-NEXT: v_mov_b32_e32 v31, s29
; GPRIDX-NEXT: v_mov_b32_e32 v30, s28
@@ -4960,41 +4960,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr
;
; GFX10-LABEL: dyn_insertelement_v16f64_s_v_s:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
+; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
+; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: s_mov_b32 s11, s13
+; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s14, s16
; GFX10-NEXT: s_mov_b32 s15, s17
+; GFX10-NEXT: s_mov_b32 s16, s18
; GFX10-NEXT: s_mov_b32 s17, s19
+; GFX10-NEXT: s_mov_b32 s18, s20
; GFX10-NEXT: s_mov_b32 s19, s21
+; GFX10-NEXT: s_mov_b32 s20, s22
; GFX10-NEXT: s_mov_b32 s21, s23
+; GFX10-NEXT: s_mov_b32 s22, s24
; GFX10-NEXT: s_mov_b32 s23, s25
+; GFX10-NEXT: s_mov_b32 s24, s26
; GFX10-NEXT: s_mov_b32 s25, s27
+; GFX10-NEXT: s_mov_b32 s26, s28
; GFX10-NEXT: s_mov_b32 s27, s29
+; GFX10-NEXT: s_mov_b32 s28, s30
; GFX10-NEXT: s_mov_b32 s29, s31
; GFX10-NEXT: s_mov_b32 s31, s33
-; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_mov_b32 s8, s10
-; GFX10-NEXT: s_mov_b32 s10, s12
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: s_mov_b32 s16, s18
-; GFX10-NEXT: s_mov_b32 s18, s20
-; GFX10-NEXT: s_mov_b32 s20, s22
-; GFX10-NEXT: s_mov_b32 s22, s24
-; GFX10-NEXT: s_mov_b32 s24, s26
-; GFX10-NEXT: s_mov_b32 s26, s28
-; GFX10-NEXT: s_mov_b32 s28, s30
; GFX10-NEXT: s_mov_b32 s30, s32
-; GFX10-NEXT: v_mov_b32_e32 v33, s31
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_lshl_b32 m0, s34, 1
+; GFX10-NEXT: v_mov_b32_e32 v33, s31
; GFX10-NEXT: v_mov_b32_e32 v32, s30
; GFX10-NEXT: v_mov_b32_e32 v31, s29
; GFX10-NEXT: v_mov_b32_e32 v30, s28
@@ -5063,41 +5063,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr
;
; GFX11-LABEL: dyn_insertelement_v16f64_s_v_s:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
+; GFX11-NEXT: s_mov_b32 s8, s10
; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: s_mov_b32 s10, s12
; GFX11-NEXT: s_mov_b32 s11, s13
+; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s13, s15
+; GFX11-NEXT: s_mov_b32 s14, s16
; GFX11-NEXT: s_mov_b32 s15, s17
+; GFX11-NEXT: s_mov_b32 s16, s18
; GFX11-NEXT: s_mov_b32 s17, s19
+; GFX11-NEXT: s_mov_b32 s18, s20
; GFX11-NEXT: s_mov_b32 s19, s21
+; GFX11-NEXT: s_mov_b32 s20, s22
; GFX11-NEXT: s_mov_b32 s21, s23
+; GFX11-NEXT: s_mov_b32 s22, s24
; GFX11-NEXT: s_mov_b32 s23, s25
+; GFX11-NEXT: s_mov_b32 s24, s26
; GFX11-NEXT: s_mov_b32 s25, s27
+; GFX11-NEXT: s_mov_b32 s26, s28
; GFX11-NEXT: s_mov_b32 s27, s29
+; GFX11-NEXT: s_mov_b32 s28, s30
; GFX11-NEXT: s_mov_b32 s29, s31
; GFX11-NEXT: s_mov_b32 s31, s33
-; GFX11-NEXT: s_mov_b32 s0, s2
-; GFX11-NEXT: s_mov_b32 s2, s4
-; GFX11-NEXT: s_mov_b32 s4, s6
-; GFX11-NEXT: s_mov_b32 s6, s8
-; GFX11-NEXT: s_mov_b32 s8, s10
-; GFX11-NEXT: s_mov_b32 s10, s12
-; GFX11-NEXT: s_mov_b32 s12, s14
-; GFX11-NEXT: s_mov_b32 s14, s16
-; GFX11-NEXT: s_mov_b32 s16, s18
-; GFX11-NEXT: s_mov_b32 s18, s20
-; GFX11-NEXT: s_mov_b32 s20, s22
-; GFX11-NEXT: s_mov_b32 s22, s24
-; GFX11-NEXT: s_mov_b32 s24, s26
-; GFX11-NEXT: s_mov_b32 s26, s28
-; GFX11-NEXT: s_mov_b32 s28, s30
; GFX11-NEXT: s_mov_b32 s30, s32
-; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_lshl_b32 m0, s34, 1
+; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30
; GFX11-NEXT: v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28
; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26
; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24
@@ -5553,9 +5553,9 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: v_mov_b32_e32 v17, s15
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_lshl_b32 m0, s16, 1
+; GFX10-NEXT: v_mov_b32_e32 v17, s15
; GFX10-NEXT: v_mov_b32_e32 v16, s14
; GFX10-NEXT: v_mov_b32_e32 v15, s13
; GFX10-NEXT: v_mov_b32_e32 v14, s12
@@ -5604,9 +5604,9 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg
; GFX11-NEXT: s_mov_b32 s11, s13
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s13, s15
-; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_lshl_b32 m0, s16, 1
+; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14
; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12
; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10
; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
@@ -6042,17 +6042,16 @@ entry:
define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s9
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: v_mov_b32_e32 v3, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0
@@ -6071,6 +6070,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s9
; GPRIDX-NEXT: v_mov_b32_e32 v10, s8
; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
@@ -6091,18 +6091,18 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
;
; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
-; GFX10-NEXT: v_mov_b32_e32 v11, s9
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: v_mov_b32_e32 v10, s8
+; GFX10-NEXT: v_mov_b32_e32 v11, s9
; GFX10-NEXT: v_mov_b32_e32 v9, s7
; GFX10-NEXT: v_mov_b32_e32 v8, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s5
@@ -6140,17 +6140,17 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
;
; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
-; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
@@ -6187,17 +6187,16 @@ entry:
define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
+; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
@@ -6211,6 +6210,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
@@ -6236,18 +6236,18 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
;
; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_mov_b32 s1, s3
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
+; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
+; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
+; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
-; GFX10-NEXT: v_mov_b32_e32 v12, s9
+; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: v_mov_b32_e32 v11, s8
+; GFX10-NEXT: v_mov_b32_e32 v12, s9
; GFX10-NEXT: v_mov_b32_e32 v10, s7
; GFX10-NEXT: v_mov_b32_e32 v9, s6
; GFX10-NEXT: v_mov_b32_e32 v8, s5
@@ -6285,17 +6285,17 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg
;
; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: s_mov_b32 s3, s5
-; GFX11-NEXT: s_mov_b32 s5, s7
-; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: s_mov_b32 s9, s11
; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
+; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
+; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: s_mov_b32 s8, s10
-; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
+; GFX11-NEXT: s_mov_b32 s9, s11
+; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 459cdbd9067e00..0c7cad39c33d24 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -379,10 +379,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -453,10 +453,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, v0, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -527,10 +527,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -601,10 +601,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1213,10 +1213,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1282,10 +1282,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
index 2c8b8126aa09a4..39292792bd7322 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -66,15 +66,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
@@ -83,6 +81,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: v_mov_b32_e32 v1, v6
; GFX9-NEXT: v_mov_b32_e32 v2, v7
; GFX9-NEXT: v_mov_b32_e32 v3, v8
@@ -190,15 +190,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
@@ -207,6 +205,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: v_mov_b32_e32 v1, v6
; GFX9-NEXT: v_mov_b32_e32 v2, v7
; GFX9-NEXT: v_mov_b32_e32 v3, v8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index 9652917e9028ee..ea7af4f29527ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -59,15 +59,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) {
; GFX9-LABEL: load_3d_v4f32_xyzw_tfe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_mov_b32_e32 v6, v2
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
@@ -76,6 +73,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v6, v2
+; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: v_mov_b32_e32 v1, v8
; GFX9-NEXT: v_mov_b32_e32 v2, v9
; GFX9-NEXT: v_mov_b32_e32 v3, v10
@@ -127,13 +127,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v12, v2
; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
+; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -172,15 +173,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) {
; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_mov_b32_e32 v6, v2
-; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
@@ -189,6 +187,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: v_mov_b32_e32 v6, v2
+; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: v_mov_b32_e32 v1, v8
; GFX9-NEXT: v_mov_b32_e32 v2, v9
; GFX9-NEXT: v_mov_b32_e32 v3, v10
@@ -240,13 +241,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: v_mov_b32_e32 v10, v6
; GFX11-NEXT: v_mov_b32_e32 v12, v2
; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, v7
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9
-; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8
+; GFX11-NEXT: v_mov_b32_e32 v2, v8
; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
index c1c383eb583aa7..48a854a71a0882 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
@@ -698,7 +698,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 %
define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, float %in) #0 {
; GFX6-LABEL: image_store_f32_dmask_1111:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v1, s10
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, s4
@@ -707,13 +706,13 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr
; GFX6-NEXT: s_mov_b32 s5, s7
; GFX6-NEXT: s_mov_b32 s6, s8
; GFX6-NEXT: s_mov_b32 s7, s9
+; GFX6-NEXT: v_mov_b32_e32 v1, s10
; GFX6-NEXT: v_mov_b32_e32 v2, s11
; GFX6-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: image_store_f32_dmask_1111:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v1, s10
; GFX8-NEXT: s_mov_b32 s0, s2
; GFX8-NEXT: s_mov_b32 s1, s3
; GFX8-NEXT: s_mov_b32 s2, s4
@@ -722,6 +721,7 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr
; GFX8-NEXT: s_mov_b32 s5, s7
; GFX8-NEXT: s_mov_b32 s6, s8
; GFX8-NEXT: s_mov_b32 s7, s9
+; GFX8-NEXT: v_mov_b32_e32 v1, s10
; GFX8-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b26ddbdd7a342e..316fbe5b78610f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -693,28 +693,29 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
-; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
; GFX11-NEXT: s_mov_b32 s13, 0x40e00000
-; GFX11-NEXT: v_mov_b32_e32 v6, s12
-; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX11-NEXT: v_mov_b32_e32 v8, s14
+; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT: v_mov_b32_e32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: s_mov_b32 s2, 2.0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_mov_b32 s1, 1.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: v_mov_b32_e32 v4, s9
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -801,24 +802,25 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v5, s10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: s_mov_b32 s2, 2.0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_mov_b32 s1, 1.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: v_mov_b32_e32 v4, s9
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -881,28 +883,28 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_mov_b32 s5, 1.0
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7
-; GFX11-NEXT: s_mov_b32 s8, 0x40400000
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
+; GFX11-NEXT: s_mov_b32 s8, 0x40400000
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
; GFX11-NEXT: s_mov_b32 s13, 0x40e00000
+; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX11-NEXT: v_mov_b32_e32 v6, s12
+; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0
-; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT: v_mov_b32_e32 v5, s10
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: v_mov_b32_e32 v1, s7
+; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -962,21 +964,21 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
-; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10
; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
+; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 76e56d91e6d8c1..3a3d44be97b9d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -52,8 +52,9 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 41f57bb23a45f6..7b222b7a21a771 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: s_nop 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 31526bcfead4e7..5d51a5597bf06d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -98,8 +98,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_co_u32 s0, s2, 0
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
@@ -136,8 +137,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_co_u32 s0, s2, 4
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297
@@ -311,12 +313,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s5, s4, 31
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
@@ -325,12 +327,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_ashr_i32 s5, s4, 31
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
@@ -909,11 +911,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s5, s4, 31
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -923,11 +925,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX7: ; %bb.0:
; GFX7-NEXT: s_ashr_i32 s5, s4, 31
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1509,8 +1511,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_co_u32 s0, s2, 0
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
-; GFX12-NEXT: v_mov_b32_e32 v2, v0
-; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s0
+; GFX12-NEXT: v_mov_b32_e32 v4, s1
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1643,9 +1645,9 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
;
; GFX12-LABEL: mubuf_cmpxchg_sgpr_ptr_vgpr_offset:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v3, v1
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index ba2af13338be6a..f21bf7561417fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2793,8 +2793,8 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: s_mulk_i32 s3, 0x50
+; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_add_u32 s3, s3, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index 191739b37672e2..bea14d9a058128 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -571,8 +571,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; NEW_RBS-NEXT: s_ashr_i32 s1, s0, 31
; NEW_RBS-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; NEW_RBS-NEXT: s_andn2_b32 s1, s5, exec_lo
-; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
+; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, exec_lo
; NEW_RBS-NEXT: s_or_b32 s5, s1, s5
; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
@@ -584,8 +584,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2
; NEW_RBS-NEXT: ; %bb.4: ; %B
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
-; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
+; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: s_mov_b32 s6, exec_lo
; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo
@@ -596,8 +596,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1
; NEW_RBS-NEXT: ; %bb.5: ; %loop.body
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
-; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
+; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: s_add_i32 s2, s0, 1
; NEW_RBS-NEXT: s_cmpk_lt_u32 s0, 0x64
; NEW_RBS-NEXT: s_cselect_b32 s0, exec_lo, 0
@@ -605,8 +605,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; NEW_RBS-NEXT: s_andn2_b32 s3, s6, exec_lo
; NEW_RBS-NEXT: s_and_b32 s0, exec_lo, s0
-; NEW_RBS-NEXT: s_or_b32 s6, s3, s0
; NEW_RBS-NEXT: global_load_dword v8, v[6:7], off
+; NEW_RBS-NEXT: s_or_b32 s6, s3, s0
; NEW_RBS-NEXT: s_mov_b32 s0, s2
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v8, 1, v8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 08184e700c1a44..af7b16c8ca9d35 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4193,8 +4193,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: s_add_u32 s4, s0, s2
; GFX6-NEXT: s_addc_u32 s3, s1, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
@@ -4218,8 +4218,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: s_add_u32 s4, s0, s2
; GFX8-NEXT: s_addc_u32 s3, s1, s3
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
@@ -4243,8 +4243,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -4571,8 +4571,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s2
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s5, s1, s3
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -4592,8 +4592,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-LABEL: s_saddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s5, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -4613,8 +4613,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-LABEL: s_saddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -4934,23 +4934,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_saddsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s8, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s9, s1, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT: s_add_u32 s0, s2, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT: s_add_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_addc_u32 s1, s3, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
@@ -4972,23 +4972,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-LABEL: s_saddsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s8, s0, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s9, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT: s_add_u32 s0, s2, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT: s_add_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
@@ -5010,23 +5010,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-LABEL: s_saddsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s8, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s9, s1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT: s_add_u32 s0, s2, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT: s_add_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s3, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
@@ -5110,13 +5110,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s5, s1, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_addc_u32 s8, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: s_addc_u32 s9, s3, s7
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
@@ -5152,12 +5152,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s4
; GFX8-NEXT: s_addc_u32 s5, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s8, s2, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s9, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
@@ -5199,12 +5199,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s4
; GFX9-NEXT: s_addc_u32 s5, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s8, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_addc_u32 s9, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
@@ -5900,13 +5900,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_saddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s8, s0, s8
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s9, s1, s9
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_addc_u32 s16, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: s_addc_u32 s17, s3, s11
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
@@ -5930,16 +5930,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
+; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT: s_add_u32 s0, s4, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_addc_u32 s1, s5, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: s_addc_u32 s2, s6, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: s_addc_u32 s3, s7, s15
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_mov_b32_e32 v3, s7
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
@@ -5979,12 +5979,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s8, s0, s8
; GFX8-NEXT: s_addc_u32 s9, s1, s9
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s16, s2, s10
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s17, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
@@ -6003,27 +6003,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NEXT: s_add_u32 s0, s4, s12
+; GFX8-NEXT: s_addc_u32 s1, s5, s13
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT: s_addc_u32 s1, s5, s13
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_addc_u32 s2, s6, s14
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
@@ -6069,12 +6069,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s8, s0, s8
; GFX9-NEXT: s_addc_u32 s9, s1, s9
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s16, s2, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_addc_u32 s17, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
@@ -6093,27 +6093,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mov_b32_e32 v3, s17
-; GFX9-NEXT: s_add_u32 s0, s4, s12
+; GFX9-NEXT: s_addc_u32 s1, s5, s13
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT: s_addc_u32 s1, s5, s13
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_addc_u32 s2, s6, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_addc_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index d41601cc0d76e4..8875c826398e16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -39,13 +39,13 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -633,6 +633,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: s_sub_i32 s1, 0, s11
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX8-NEXT: s_add_i32 s0, s8, s12
; GFX8-NEXT: s_xor_b32 s0, s0, s12
@@ -679,13 +680,12 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: s_xor_b32 s0, s2, s10
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -866,6 +866,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0
; GFX8-NEXT: s_sub_i32 s11, 0, s10
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
@@ -970,15 +971,15 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc
; GFX8-NEXT: s_xor_b32 s0, s3, s2
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc
; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7
-; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8
-; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3
+; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s4
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s3, v7
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
@@ -2221,13 +2222,13 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_byte v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -2401,12 +2402,12 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3
; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_short v[0:1], v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3
@@ -2630,13 +2631,13 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -3036,15 +3037,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_and_b32_e32 v2, 7, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -3187,15 +3188,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2
; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index be8cb232931766..f7795a1dfbb7d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -257,8 +257,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT: flat_store_dword v[2:3], v1
@@ -272,8 +272,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
; GFX9-NEXT: global_store_dword v[2:3], v1, off
@@ -287,8 +287,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v4, s0
+; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX10-NEXT: global_store_dword v[2:3], v1, off
@@ -347,9 +347,9 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dword v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 94f943af2532ab..5565ab41c9f4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4199,8 +4199,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: s_sub_u32 s4, s0, s2
; GFX6-NEXT: s_subb_u32 s3, s1, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
@@ -4224,8 +4224,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: s_sub_u32 s4, s0, s2
; GFX8-NEXT: s_subb_u32 s3, s1, s3
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
@@ -4249,8 +4249,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -4577,8 +4577,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_ssubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s5, s1, s3
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -4598,8 +4598,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-LABEL: s_ssubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s5, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -4619,8 +4619,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-LABEL: s_ssubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -4940,23 +4940,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_ssubsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s9, s1, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT: s_sub_u32 s0, s2, s6
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT: s_sub_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_subb_u32 s1, s3, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
@@ -4978,23 +4978,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-LABEL: s_ssubsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s9, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT: s_sub_u32 s0, s2, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT: s_sub_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_subb_u32 s1, s3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
@@ -5016,23 +5016,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-LABEL: s_ssubsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s9, s1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT: s_sub_u32 s0, s2, s6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT: s_sub_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_subb_u32 s1, s3, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
@@ -5116,13 +5116,13 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_ssubsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s9, s1, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_subb_u32 s10, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: s_subb_u32 s11, s3, s7
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
@@ -5160,12 +5160,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
; GFX8-NEXT: s_subb_u32 s9, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s10, s2, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_subb_u32 s11, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
@@ -5209,12 +5209,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
; GFX9-NEXT: s_subb_u32 s9, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s10, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_subb_u32 s11, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
@@ -5954,13 +5954,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_ssubsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s16, s0, s8
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s17, s1, s9
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_subb_u32 s18, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
; GFX6-NEXT: s_subb_u32 s19, s3, s11
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
@@ -5986,16 +5986,16 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_mov_b32_e32 v0, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s18
; GFX6-NEXT: v_mov_b32_e32 v3, s19
+; GFX6-NEXT: s_sub_u32 s0, s4, s12
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT: s_sub_u32 s0, s4, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_subb_u32 s1, s5, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: s_subb_u32 s2, s6, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: s_subb_u32 s3, s7, s15
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: v_mov_b32_e32 v3, s7
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
@@ -6037,12 +6037,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s16, s0, s8
; GFX8-NEXT: s_subb_u32 s17, s1, s9
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s18, s2, s10
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_subb_u32 s19, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
@@ -6063,27 +6063,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v3, s17
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: s_sub_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NEXT: s_sub_u32 s0, s4, s12
+; GFX8-NEXT: s_subb_u32 s1, s5, s13
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT: s_subb_u32 s1, s5, s13
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_subb_u32 s2, s6, s14
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_subb_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
@@ -6131,12 +6131,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s16, s0, s8
; GFX9-NEXT: s_subb_u32 s17, s1, s9
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s18, s2, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_subb_u32 s19, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
@@ -6157,27 +6157,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_sub_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: s_sub_u32 s0, s4, s12
+; GFX9-NEXT: s_subb_u32 s1, s5, s13
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT: s_subb_u32 s1, s5, s13
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_subb_u32 s2, s6, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_subb_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index 3741983a3067b8..b09183c00309e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -679,8 +679,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_ssubo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_sub_u32 s4, s0, s2
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_subb_u32 s5, s1, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -696,8 +696,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_ssubo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s5, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -713,8 +713,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_ssubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index e3c1a52696b47c..4d141f9759234a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -32,9 +32,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -527,6 +527,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX8-NEXT: s_sub_i32 s0, 0, s10
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -567,7 +568,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
@@ -694,6 +694,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s18
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
@@ -772,7 +773,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s19, v8
; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -1235,9 +1235,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_mov_b32_e32 v9, s4
; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
@@ -1799,9 +1799,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_byte v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -2108,9 +2108,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
@@ -2414,11 +2414,11 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_and_b32_e32 v2, 7, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2532,11 +2532,11 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 6730df000e3b8c..aa550933570de9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -337,14 +337,14 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_u32 s2, s0, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
index 4959e10d2a18db..cdc9fe42eff45e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -20,15 +20,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
@@ -62,15 +62,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7
; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
@@ -102,10 +102,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
@@ -122,10 +122,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
@@ -142,10 +142,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3
; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
@@ -176,15 +176,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
@@ -218,15 +218,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7
; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
@@ -260,15 +260,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
@@ -302,15 +302,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
@@ -344,15 +344,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
@@ -386,15 +386,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
@@ -428,15 +428,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7
; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 173dd011f4d67f..2612b18059289b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -18,11 +18,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -52,11 +52,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -85,11 +85,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x42004200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_endpgm
@@ -103,11 +102,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_endpgm
@@ -121,11 +119,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_endpgm
@@ -152,11 +149,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -186,11 +183,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -220,11 +217,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -254,11 +251,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -288,11 +285,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -322,11 +319,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_mov_b32 s0, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -356,11 +353,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_movk_i32 s0, 0x80
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
-; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
; GFX12-NEXT: v_mov_b32_e32 v5, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 194a23fa0d4a96..15672721aa7067 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -239,8 +239,8 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
-; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
+; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
index e9a8248ef4e900..50e7b534ac1a8f 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -83,21 +83,21 @@ body: |
; GFX908-LABEL: name: a2_to_v2
; GFX908: liveins: $agpr0_agpr1
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
;
; GFX90A-LABEL: name: a2_to_v2
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
;
; GFX940-LABEL: name: a2_to_v2
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
$vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec
@@ -114,7 +114,7 @@ body: |
; GFX908-LABEL: name: a3_to_v3
; GFX908: liveins: $agpr0_agpr1_agpr2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
@@ -122,7 +122,7 @@ body: |
; GFX90A-LABEL: name: a3_to_v3
; GFX90A: liveins: $agpr0_agpr1_agpr2
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
@@ -130,7 +130,7 @@ body: |
; GFX940-LABEL: name: a3_to_v3
; GFX940: liveins: $agpr0_agpr1_agpr2
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
@@ -147,7 +147,7 @@ body: |
; GFX908-LABEL: name: a4_to_v4
; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -156,7 +156,7 @@ body: |
; GFX90A-LABEL: name: a4_to_v4
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -165,7 +165,7 @@ body: |
; GFX940-LABEL: name: a4_to_v4
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -184,7 +184,7 @@ body: |
; GFX908-LABEL: name: a8_to_v8
; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -197,7 +197,7 @@ body: |
; GFX90A-LABEL: name: a8_to_v8
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -210,7 +210,7 @@ body: |
; GFX940-LABEL: name: a8_to_v8
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -232,7 +232,7 @@ body: |
; GFX908-LABEL: name: a16_to_v16
; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -253,7 +253,7 @@ body: |
; GFX90A-LABEL: name: a16_to_v16
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -274,7 +274,7 @@ body: |
; GFX940-LABEL: name: a16_to_v16
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -331,21 +331,21 @@ body: |
; GFX908-LABEL: name: v2_to_a2
; GFX908: liveins: $vgpr0_vgpr1
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
;
; GFX90A-LABEL: name: v2_to_a2
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
;
; GFX940-LABEL: name: v2_to_a2
; GFX940: liveins: $vgpr0_vgpr1
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
$agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec
@@ -361,7 +361,7 @@ body: |
; GFX908-LABEL: name: v3_to_a3
; GFX908: liveins: $vgpr0_vgpr1_vgpr2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -369,7 +369,7 @@ body: |
; GFX90A-LABEL: name: v3_to_a3
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -377,7 +377,7 @@ body: |
; GFX940-LABEL: name: v3_to_a3
; GFX940: liveins: $vgpr0_vgpr1_vgpr2
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -394,7 +394,7 @@ body: |
; GFX908-LABEL: name: v4_to_a4
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
@@ -403,7 +403,7 @@ body: |
; GFX90A-LABEL: name: v4_to_a4
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
@@ -412,7 +412,7 @@ body: |
; GFX940-LABEL: name: v4_to_a4
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
@@ -430,7 +430,7 @@ body: |
; GFX908-LABEL: name: v8_to_a8
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@@ -443,7 +443,7 @@ body: |
; GFX90A-LABEL: name: v8_to_a8
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@@ -456,7 +456,7 @@ body: |
; GFX940-LABEL: name: v8_to_a8
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@@ -478,7 +478,7 @@ body: |
; GFX908-LABEL: name: v16_to_a16
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -499,7 +499,7 @@ body: |
; GFX90A-LABEL: name: v16_to_a16
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -520,7 +520,7 @@ body: |
; GFX940-LABEL: name: v16_to_a16
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -579,7 +579,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
@@ -587,14 +587,14 @@ body: |
; GFX90A-LABEL: name: s2_to_a2
; GFX90A: liveins: $sgpr0_sgpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
;
; GFX940-LABEL: name: s2_to_a2
; GFX940: liveins: $sgpr0_sgpr1
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
$agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec
@@ -611,7 +611,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2
@@ -621,7 +621,7 @@ body: |
; GFX90A-LABEL: name: s3_to_a3
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -629,7 +629,7 @@ body: |
; GFX940-LABEL: name: s3_to_a3
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -647,7 +647,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -659,7 +659,7 @@ body: |
; GFX90A-LABEL: name: s4_to_a4
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -668,7 +668,7 @@ body: |
; GFX940-LABEL: name: s4_to_a4
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -687,7 +687,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
@@ -703,7 +703,7 @@ body: |
; GFX90A-LABEL: name: s6_to_a6
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
@@ -714,7 +714,7 @@ body: |
; GFX940-LABEL: name: s6_to_a6
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
@@ -735,7 +735,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -755,7 +755,7 @@ body: |
; GFX90A-LABEL: name: s8_to_a8
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -768,7 +768,7 @@ body: |
; GFX940-LABEL: name: s8_to_a8
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -791,7 +791,7 @@ body: |
; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -827,7 +827,7 @@ body: |
; GFX90A-LABEL: name: s16_to_a16
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -848,7 +848,7 @@ body: |
; GFX940-LABEL: name: s16_to_a16
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX940-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -904,7 +904,7 @@ body: |
; GFX908: liveins: $agpr0_agpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
- ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2
+ ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
@@ -913,7 +913,7 @@ body: |
; GFX90A-LABEL: name: a2_to_a2
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
@@ -921,7 +921,7 @@ body: |
; GFX940-LABEL: name: a2_to_a2
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1
+ ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1
; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
@@ -940,7 +940,7 @@ body: |
; GFX908: liveins: $agpr0_agpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
- ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2
+ ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
@@ -949,7 +949,7 @@ body: |
; GFX90A-LABEL: name: a2_to_a2_kill
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
@@ -957,7 +957,7 @@ body: |
; GFX940-LABEL: name: a2_to_a2_kill
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1
+ ; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1
; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
@@ -980,7 +980,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2
- ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr3_agpr4
+ ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr1_agpr2
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
;
@@ -991,7 +991,7 @@ body: |
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2
- ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2
+ ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec
;
; GFX940-LABEL: name: a2_to_a2_implicit_defs
@@ -1001,7 +1001,7 @@ body: |
; GFX940-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2
; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX940-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2
- ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2
+ ; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2
; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec
$vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
$agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2
@@ -1020,7 +1020,7 @@ body: |
; GFX908: liveins: $agpr4_agpr5_agpr6
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
@@ -1030,7 +1030,7 @@ body: |
; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill
; GFX90A: liveins: $agpr4_agpr5_agpr6
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -1038,7 +1038,7 @@ body: |
; GFX940-LABEL: name: a3_to_a3_nonoverlap_kill
; GFX940: liveins: $agpr4_agpr5_agpr6
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
; GFX940-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
@@ -1056,7 +1056,7 @@ body: |
; GFX908: liveins: $agpr1_agpr2_agpr3
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2
+ ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
@@ -1067,7 +1067,7 @@ body: |
; GFX90A-LABEL: name: a3_to_a3_overlap_kill
; GFX90A: liveins: $agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
@@ -1076,7 +1076,7 @@ body: |
; GFX940-LABEL: name: a3_to_a3_overlap_kill
; GFX940: liveins: $agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
@@ -1094,7 +1094,7 @@ body: |
; GFX908-LABEL: name: a4_to_a4
; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1105,7 +1105,7 @@ body: |
;
; GFX90A-LABEL: name: a4_to_a4
; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
- ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1113,7 +1113,7 @@ body: |
;
; GFX940-LABEL: name: a4_to_a4
; GFX940: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
- ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1133,7 +1133,7 @@ body: |
; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5
+ ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1145,7 +1145,7 @@ body: |
; GFX90A-LABEL: name: a4_to_a4_overlap
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1154,7 +1154,7 @@ body: |
; GFX940-LABEL: name: a4_to_a4_overlap
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1171,7 +1171,7 @@ body: |
; GFX908-LABEL: name: a8_to_a8
; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
- ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -1190,7 +1190,7 @@ body: |
;
; GFX90A-LABEL: name: a8_to_a8
; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
- ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -1202,7 +1202,7 @@ body: |
;
; GFX940-LABEL: name: a8_to_a8
; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
- ; GFX940-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; GFX940-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -1225,7 +1225,7 @@ body: |
; GFX908-LABEL: name: a16_to_a16
; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -1260,7 +1260,7 @@ body: |
;
; GFX90A-LABEL: name: a16_to_a16
; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
- ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -1280,7 +1280,7 @@ body: |
;
; GFX940-LABEL: name: a16_to_a16
; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
- ; GFX940-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX940-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -1349,7 +1349,7 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -1362,7 +1362,7 @@ body: |
; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
- ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -1372,7 +1372,7 @@ body: |
; GFX940: liveins: $agpr0, $sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
- ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -1393,7 +1393,7 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
@@ -1406,7 +1406,7 @@ body: |
; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
- ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -1416,7 +1416,7 @@ body: |
; GFX940: liveins: $agpr0, $sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1
- ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX940-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
@@ -1438,7 +1438,7 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1451,7 +1451,7 @@ body: |
; GFX90A: liveins: $agpr0, $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
- ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1461,7 +1461,7 @@ body: |
; GFX940: liveins: $agpr0, $agpr2_agpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
- ; GFX940-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1483,7 +1483,7 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7
+ ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1496,7 +1496,7 @@ body: |
; GFX90A: liveins: $agpr0, $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
- ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -1506,7 +1506,7 @@ body: |
; GFX940: liveins: $agpr0, $agpr2_agpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1
- ; GFX940-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX940-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index cd5b585a8c4e23..ad0ac222cd599f 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -344,9 +344,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -1021,8 +1021,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -1033,8 +1034,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1123,8 +1125,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -1137,8 +1140,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1166,10 +1170,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX6-NEXT: .LBB9_2: ; %if
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NEXT: .LBB9_3: ; %endif
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -1192,10 +1196,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: .LBB9_2: ; %if
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX8-NEXT: .LBB9_3: ; %endif
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -1218,9 +1222,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: .LBB9_2: ; %if
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
@@ -1241,9 +1245,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX10-NEXT: .LBB9_2: ; %if
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
@@ -1265,8 +1269,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT: .LBB9_3: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB9_4:
@@ -1286,8 +1290,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT: .LBB9_3: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB9_4:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 4ce46bbaf45ac1..67a82fb62ae5d9 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -574,15 +574,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_mov_b32 s7, s6
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
; GFX908-NEXT: v_mov_b32_e32 v4, s6
+; GFX908-NEXT: v_mov_b32_e32 v5, s7
; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
; GFX908-NEXT: v_mov_b32_e32 v6, s6
-; GFX908-NEXT: v_mov_b32_e32 v9, s7
-; GFX908-NEXT: v_mov_b32_e32 v5, s7
; GFX908-NEXT: v_mov_b32_e32 v7, s7
+; GFX908-NEXT: v_mov_b32_e32 v9, s7
; GFX908-NEXT: v_mov_b32_e32 v8, s6
; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
+; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: v_mov_b32_e32 v10, v4
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s7, v2
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
index 950382758ffbc5..a225b39f63231e 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
@@ -35,7 +35,7 @@ body: |
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$agpr0_agpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1
- ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3
+ ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
@@ -43,7 +43,7 @@ body: |
; GFX90A-LABEL: name: no_free_vgprs_for_copy_a64_to_a64
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$agpr0_agpr1
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1
+ ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
$agpr2_agpr3 = COPY $agpr0_agpr1
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir
index 683a89061ddda3..6e1131641da807 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir
@@ -14,7 +14,7 @@ body: |
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
- ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -46,7 +46,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
- ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1
+ ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0
$agpr0_agpr1 = IMPLICIT_DEF
@@ -66,7 +66,7 @@ body: |
; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
- ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr1_agpr2_agpr3_agpr4
+ ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
index a9d31c1c45b0e5..3e5793dc6c0ac4 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
@@ -36,7 +36,7 @@ body: |
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$sgpr8_sgpr9
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
- ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3
+ ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
@@ -44,7 +44,7 @@ body: |
; GFX90A-LABEL: name: no_free_vgprs_for_copy_s64_to_a64
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$sgpr8_sgpr9
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit-def $agpr2_agpr3, implicit $sgpr8_sgpr9
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
$agpr2_agpr3 = COPY $sgpr8_sgpr9
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 8ca3e8255b6340..81b94aab554179 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -54,9 +54,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2
; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3
-; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
+; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL-GFX10-NEXT: s_mov_b32 s32, 0
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -86,9 +86,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -219,9 +219,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v29, v37
; GISEL-GFX10-NEXT: v_mov_b32_e32 v30, v38
; GISEL-GFX10-NEXT: v_mov_b32_e32 v31, v39
-; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi
+; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25]
; GISEL-GFX10-NEXT: s_endpgm
@@ -347,9 +347,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v29, v34
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v30, v33
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v31, v32
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25]
; DAGISEL-GFX10-NEXT: s_endpgm
@@ -374,9 +374,9 @@ define amdgpu_cs_chain void @alloca_and_call() {
; GISEL-GFX10: ; %bb.0: ; %.entry
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
-; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
+; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -400,9 +400,9 @@ define amdgpu_cs_chain void @alloca_and_call() {
; DAGISEL-GFX10: ; %bb.0: ; %.entry
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi
; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49]
; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -449,10 +449,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -489,10 +489,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -530,10 +530,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) {
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX10-NEXT: s_bitset0_b32 s103, 21
; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s0
-; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
+; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
;
@@ -564,10 +564,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) {
; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL-GFX10-NEXT: s_bitset0_b32 s103, 21
; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s0
-; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0)
@@ -934,11 +934,11 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
; GISEL-GFX11-NEXT: s_mov_b32 s2, 3
; GISEL-GFX11-NEXT: s_mov_b32 s1, 2
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
-; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 4ba9f0729ea1f3..f589e66be2432b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -60,10 +60,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
-; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; GISEL-GFX10-NEXT: s_mov_b32 s0, s3
+; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -100,10 +100,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
-; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3
-; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
+; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0
; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103]
+; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101]
; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3
; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1
; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5]
@@ -587,11 +587,11 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
; GISEL-GFX11-NEXT: s_mov_b32 s2, 3
; GISEL-GFX11-NEXT: s_mov_b32 s1, 2
; GISEL-GFX11-NEXT: s_mov_b32 s0, 1
-; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 6166c05c6f8959..77881b7f361e90 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2383,12 +2383,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7
; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
@@ -2436,12 +2436,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7
; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
@@ -2862,12 +2862,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s10, -1
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: s_mov_b32 s8, s2
; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7
; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc
; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
@@ -2945,12 +2945,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s10, -1
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: s_mov_b32 s8, s2
; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7
; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
@@ -5969,12 +5969,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7
; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
@@ -6022,12 +6022,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7
; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc
; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
@@ -6448,12 +6448,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s10, -1
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: s_mov_b32 s8, s2
; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7
; GFX8_DPP-NEXT: buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc
; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
@@ -6531,12 +6531,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s10, -1
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: s_mov_b32 s8, s2
; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7
; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc
; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 8062dbbca73932..7348bd6a34fdc0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2044,8 +2044,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
@@ -2093,8 +2093,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2140,8 +2140,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2187,8 +2187,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2904,8 +2904,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
@@ -2937,8 +2937,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2969,8 +2969,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -3001,8 +3001,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -3039,8 +3039,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1
; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -3075,8 +3075,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4
; GFX1132_ITERATIVE-NEXT: ; %bb.3:
-; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0
; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1]
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
@@ -5489,8 +5489,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
@@ -5538,8 +5538,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -5585,8 +5585,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -5632,8 +5632,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -5686,8 +5686,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -6975,8 +6975,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
@@ -7022,8 +7022,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -7067,8 +7067,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -7113,8 +7113,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -7164,8 +7164,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -8339,8 +8339,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
@@ -8386,8 +8386,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -8431,8 +8431,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -8477,8 +8477,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -8528,8 +8528,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9703,8 +9703,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
@@ -9750,8 +9750,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9795,8 +9795,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9841,8 +9841,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9892,8 +9892,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11316,8 +11316,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
@@ -11372,8 +11372,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11424,8 +11424,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11475,8 +11475,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11534,8 +11534,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13149,8 +13149,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
@@ -13205,8 +13205,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13257,8 +13257,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13308,8 +13308,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13367,8 +13367,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -14977,8 +14977,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
@@ -15032,8 +15032,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15083,8 +15083,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15133,8 +15133,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15191,8 +15191,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16796,8 +16796,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
; GFX8_ITERATIVE-NEXT: ; %bb.3:
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1
; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
@@ -16851,8 +16851,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
; GFX9_ITERATIVE-NEXT: ; %bb.3:
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16902,8 +16902,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
; GFX1064_ITERATIVE-NEXT: ; %bb.3:
-; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16952,8 +16952,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
; GFX1032_ITERATIVE-NEXT: ; %bb.3:
-; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1
; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0
; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4]
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -17010,8 +17010,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4
; GFX1164_ITERATIVE-NEXT: ; %bb.3:
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0
; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3]
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index e74fd21365c9d7..3c1d6bda69ca09 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -20,8 +20,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -44,8 +44,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -64,7 +64,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v0, s4
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
@@ -75,10 +75,10 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v0, s4
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index bb7974335bf284..af998258377aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -53,8 +53,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GISEL-NEXT: s_brev_b32 s2, s2
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
@@ -137,11 +137,11 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_load_ushort v0, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
;
@@ -207,8 +207,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
@@ -290,10 +290,10 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa
; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
;
@@ -370,8 +370,8 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32>
; GISEL-NEXT: s_brev_b32 s2, s2
; GISEL-NEXT: s_brev_b32 s3, s3
; GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT: s_endpgm
@@ -536,8 +536,8 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT: s_endpgm
@@ -703,10 +703,10 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v4, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GISEL-NEXT: s_endpgm
@@ -799,9 +799,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v4, v1
; GISEL-NEXT: v_bfrev_b32_e32 v5, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_bfrev_b32_e32 v6, v3
; GISEL-NEXT: v_bfrev_b32_e32 v7, v2
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index f9ffa5ae57f3ed..fcf65c92c86511 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -31,11 +31,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_cbranch_vccz .LBB0_6
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: s_mov_b32 s13, s12
; CHECK-NEXT: s_mov_b32 s14, s12
; CHECK-NEXT: s_mov_b32 s15, s12
-; CHECK-NEXT: s_mov_b32 s13, s12
-; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
@@ -80,8 +80,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
-; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v1, s37
; CHECK-NEXT: v_mov_b32_e32 v2, s38
; CHECK-NEXT: v_mov_b32_e32 v3, s39
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 0ea73ad4c5019a..f325f43df44c3c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -64,8 +64,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -576,8 +576,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f32_e32 v7, v8, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -634,8 +634,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: ; Child Loop BB2_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v7, v8, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: v_mov_b32_e32 v7, v8
; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -691,8 +691,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX7-NEXT: ; Child Loop BB2_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_f32_e32 v7, v8, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v7
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v6, v7
; GFX7-NEXT: v_mov_b32_e32 v7, v8
; GFX7-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -748,8 +748,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX6-NEXT: ; Child Loop BB2_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: v_add_f32_e32 v7, v8, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v7
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v6, v7
; GFX6-NEXT: v_mov_b32_e32 v7, v8
; GFX6-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
@@ -835,8 +835,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1229,7 +1229,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1258,8 +1258,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1453,7 +1453,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1482,8 +1482,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1677,7 +1677,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1706,8 +1706,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1884,10 +1884,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1925,10 +1925,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1959,10 +1959,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -2000,10 +2000,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2031,10 +2031,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -2062,9 +2062,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v10, v1
; GFX7-NEXT: v_mov_b32_e32 v9, v0
; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2094,9 +2094,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX6-NEXT: v_mov_b32_e32 v9, v0
; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v3, v10
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -2391,11 +2391,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: ; Child Loop BB10_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6]
+; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
@@ -2491,11 +2491,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: ; Child Loop BB10_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
@@ -2558,12 +2558,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX10-NEXT: ; Child Loop BB10_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13
+; GFX10-NEXT: v_mov_b32_e32 v3, v14
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, v11
; GFX10-NEXT: v_mov_b32_e32 v1, v12
-; GFX10-NEXT: v_mov_b32_e32 v2, v13
-; GFX10-NEXT: v_mov_b32_e32 v3, v14
; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v9
@@ -2656,10 +2656,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v0, v11
-; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
; GFX908-NEXT: v_mov_b32_e32 v3, v14
+; GFX908-NEXT: v_mov_b32_e32 v0, v11
+; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -2720,10 +2720,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v0, v11
-; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
; GFX8-NEXT: v_mov_b32_e32 v3, v14
+; GFX8-NEXT: v_mov_b32_e32 v0, v11
+; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -2904,10 +2904,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2945,10 +2945,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -2979,10 +2979,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -3039,10 +3039,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3070,10 +3070,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3101,9 +3101,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v10, v1
; GFX7-NEXT: v_mov_b32_e32 v9, v0
; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3133,9 +3133,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX6-NEXT: v_mov_b32_e32 v9, v0
; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v3, v10
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -3173,10 +3173,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3214,10 +3214,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -3248,10 +3248,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -3289,10 +3289,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v10, v1
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -3320,10 +3320,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3351,9 +3351,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX7-NEXT: v_mov_b32_e32 v10, v1
; GFX7-NEXT: v_mov_b32_e32 v9, v0
; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3383,9 +3383,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX6-NEXT: v_mov_b32_e32 v9, v0
; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v3, v10
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -4492,8 +4492,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: v_add_f16_e32 v6, v6, v5
; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX908-NEXT: v_mov_b32_e32 v9, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v9, v7
; GFX908-NEXT: v_mov_b32_e32 v8, v6
; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4559,8 +4559,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_mov_b32_e32 v9, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v9, v7
; GFX8-NEXT: v_mov_b32_e32 v8, v6
; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5599,8 +5599,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -5763,8 +5762,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -5987,8 +5985,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6061,8 +6059,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6130,8 +6128,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6200,8 +6198,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6282,7 +6280,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -6311,8 +6309,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -6816,8 +6814,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8
; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6965,8 +6962,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v7, v8, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -7027,8 +7024,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: v_mov_b32_e32 v7, v8
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -7261,7 +7258,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -7290,8 +7287,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -7790,7 +7787,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -7819,8 +7816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -8380,7 +8377,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -8426,8 +8423,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -9204,8 +9201,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -9437,8 +9433,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -9514,8 +9510,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -9586,8 +9582,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -9660,8 +9656,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
@@ -9796,7 +9792,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -9842,8 +9838,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -10553,7 +10549,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -10599,8 +10595,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -11642,8 +11638,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 7f06d169a6b13b..154c415a900d7a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -603,8 +603,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
; GFX908-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -663,8 +663,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX8-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -813,7 +813,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -844,8 +844,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1211,9 +1211,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1255,9 +1255,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1309,10 +1309,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1342,10 +1342,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -1607,10 +1607,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
@@ -1709,10 +1709,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
@@ -1837,10 +1837,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v11
-; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
; GFX908-NEXT: v_mov_b32_e32 v3, v14
+; GFX908-NEXT: v_mov_b32_e32 v0, v11
+; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -1903,10 +1903,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v11
-; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
; GFX8-NEXT: v_mov_b32_e32 v3, v14
+; GFX8-NEXT: v_mov_b32_e32 v0, v11
+; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -2018,9 +2018,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2062,9 +2062,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -2097,10 +2097,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -2161,10 +2161,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2194,10 +2194,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -2227,9 +2227,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v9, v0
; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2261,9 +2261,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v3, v10
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -2305,9 +2305,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2349,9 +2349,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -2403,10 +2403,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2436,10 +2436,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3248,8 +3248,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -3398,8 +3397,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -3607,8 +3605,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: v_max_f16_e32 v4, v4, v10
; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3676,8 +3674,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4720,8 +4718,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -4884,8 +4881,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -5108,8 +5104,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5182,8 +5178,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5252,8 +5248,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5323,8 +5319,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5389,7 +5385,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5451,7 +5447,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
; GFX11-NEXT: v_pk_max_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -5482,8 +5478,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
; GFX10-NEXT: v_pk_max_f16 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -6036,8 +6032,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6166,8 +6161,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6352,8 +6346,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6416,8 +6410,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v5, v5, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6656,7 +6650,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6752,7 +6746,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -6798,8 +6792,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -7507,8 +7501,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7671,8 +7664,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -7904,8 +7896,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -7981,8 +7973,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -8048,13 +8040,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GFX7-NEXT: v_max_f32_e32 v4, v4, v9
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -8122,13 +8114,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GFX6-NEXT: v_max_f32_e32 v4, v4, v9
; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_max_f32_e32 v7, v7, v10
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index a6eb81fcbf5155..4285c9b2c2a869 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -603,8 +603,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
; GFX908-NEXT: v_min_f32_e32 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -663,8 +663,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX8-NEXT: v_min_f32_e32 v5, v4, v8
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -813,7 +813,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -844,8 +844,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -1211,9 +1211,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1255,9 +1255,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -1309,10 +1309,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -1342,10 +1342,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -1607,10 +1607,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v9
@@ -1709,10 +1709,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
@@ -1837,10 +1837,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX908-NEXT: s_mov_b64 s[12:13], exec
; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v11
-; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: v_mov_b32_e32 v2, v13
; GFX908-NEXT: v_mov_b32_e32 v3, v14
+; GFX908-NEXT: v_mov_b32_e32 v0, v11
+; GFX908-NEXT: v_mov_b32_e32 v1, v12
; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -1903,10 +1903,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v11
-; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: v_mov_b32_e32 v2, v13
; GFX8-NEXT: v_mov_b32_e32 v3, v14
+; GFX8-NEXT: v_mov_b32_e32 v0, v11
+; GFX8-NEXT: v_mov_b32_e32 v1, v12
; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -2018,9 +2018,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2062,9 +2062,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -2097,10 +2097,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: v_mov_b32_e32 v2, v9
; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -2161,10 +2161,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2194,10 +2194,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -2227,9 +2227,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX7-NEXT: v_mov_b32_e32 v9, v0
; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
; GFX7-NEXT: v_mov_b32_e32 v3, v10
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2261,9 +2261,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
; GFX6-NEXT: v_mov_b32_e32 v3, v10
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -2305,9 +2305,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -2349,9 +2349,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -2403,10 +2403,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: v_mov_b32_e32 v9, v0
; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: v_mov_b32_e32 v2, v9
; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -2436,10 +2436,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: v_mov_b32_e32 v9, v0
; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: v_mov_b32_e32 v2, v9
; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -3248,8 +3248,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -3398,8 +3397,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -3607,8 +3605,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: v_min_f16_e32 v4, v4, v10
; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3676,8 +3674,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4720,8 +4718,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -4884,8 +4881,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -5108,8 +5104,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5182,8 +5178,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5252,8 +5248,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5323,8 +5319,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5389,7 +5385,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5451,7 +5447,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -5482,8 +5478,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
; GFX10-NEXT: v_pk_min_f16 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -6036,8 +6032,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6166,8 +6161,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6352,8 +6346,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
; GFX908-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6416,8 +6410,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v5, v5, v9
; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -6656,7 +6650,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6752,7 +6746,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
@@ -6798,8 +6792,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
@@ -7507,8 +7501,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7671,8 +7664,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -7904,8 +7896,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
; GFX908-NEXT: v_mov_b32_e32 v5, v6
; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
@@ -7981,8 +7973,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
@@ -8048,13 +8040,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GFX7-NEXT: v_min_f32_e32 v4, v4, v9
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_min_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
@@ -8122,13 +8114,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GFX6-NEXT: v_min_f32_e32 v4, v4, v9
; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_min_f32_e32 v7, v7, v10
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
index ce55e9171c8180..8f27a599c761f4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll
@@ -11,10 +11,10 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) {
; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v4, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: v_mov_b32_e32 v6, s6
; GISEL-NEXT: v_mov_b32_e32 v7, s7
@@ -68,9 +68,9 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace
; GISEL-NEXT: s_ashr_i32 s7, s6, 31
; GISEL-NEXT: s_lshl_b64 s[4:5], s[6:7], 5
; GISEL-NEXT: s_add_u32 s4, s8, s4
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v4, s6
; GISEL-NEXT: s_addc_u32 s5, s9, s5
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 4ab940288e8c86..80acdaee6993d1 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -273,9 +273,9 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 35d00390067d89..7a7c8b70fc6c39 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4283,9 +4283,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; VI-NEXT: s_mov_b32 s39, 0xe80000
; VI-NEXT: s_add_u32 s36, s36, s3
; VI-NEXT: s_addc_u32 s37, s37, 0
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[8:9]
; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
@@ -4315,9 +4315,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_mov_b32 s39, 0xe8f000
; CI-NEXT: s_add_u32 s36, s36, s3
; CI-NEXT: s_addc_u32 s37, s37, 0
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[8:9]
; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
@@ -4347,9 +4347,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX9-NEXT: s_mov_b32 s39, 0xe00000
; GFX9-NEXT: s_add_u32 s36, s36, s3
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
@@ -4440,9 +4440,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_mov_b32 s32, 0
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
@@ -4475,9 +4475,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
@@ -4510,9 +4510,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
@@ -4598,8 +4598,8 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; VI-NEXT: s_mov_b32 s42, -1
; VI-NEXT: s_mov_b32 s43, 0xe80000
-; VI-NEXT: s_add_u32 s40, s40, s5
; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24
+; VI-NEXT: s_add_u32 s40, s40, s5
; VI-NEXT: s_addc_u32 s41, s41, 0
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[40:41]
@@ -4622,8 +4622,8 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; CI-NEXT: s_mov_b32 s42, -1
; CI-NEXT: s_mov_b32 s43, 0xe8f000
-; CI-NEXT: s_add_u32 s40, s40, s5
; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; CI-NEXT: s_add_u32 s40, s40, s5
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
@@ -4646,8 +4646,8 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s42, -1
; GFX9-NEXT: s_mov_b32 s43, 0xe00000
-; GFX9-NEXT: s_add_u32 s40, s40, s5
; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24
+; GFX9-NEXT: s_add_u32 s40, s40, s5
; GFX9-NEXT: s_addc_u32 s41, s41, 0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
@@ -4837,9 +4837,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; VI-NEXT: s_movk_i32 s32, 0x400
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; VI-NEXT: s_movk_i32 s32, 0x400
; VI-NEXT: s_mov_b64 s[2:3], s[38:39]
; VI-NEXT: s_getpc_b64 s[4:5]
; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
@@ -4865,9 +4865,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; CI-NEXT: s_movk_i32 s32, 0x400
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[36:37]
-; CI-NEXT: s_movk_i32 s32, 0x400
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
@@ -4894,9 +4894,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
@@ -5378,9 +5378,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; VI-NEXT: s_mov_b64 s[6:7], s[0:1]
; VI-NEXT: s_mov_b64 s[0:1], s[52:53]
-; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; VI-NEXT: s_mov_b64 s[2:3], s[54:55]
; VI-NEXT: v_mov_b32_e32 v0, s36
; VI-NEXT: v_mov_b32_e32 v1, s37
@@ -5437,9 +5437,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
; CI-NEXT: v_mov_b32_e32 v0, s5
+; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; CI-NEXT: s_mov_b64 s[6:7], s[0:1]
; CI-NEXT: s_mov_b64 s[0:1], s[52:53]
-; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; CI-NEXT: s_mov_b64 s[2:3], s[54:55]
; CI-NEXT: v_mov_b32_e32 v0, s36
; CI-NEXT: v_mov_b32_e32 v1, s37
@@ -5496,9 +5496,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53]
-; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-NEXT: v_mov_b32_e32 v0, s36
; GFX9-NEXT: v_mov_b32_e32 v1, s37
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 56ecfa298a348f..5ef76474d58acc 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -162,14 +162,14 @@ define amdgpu_kernel void @call_coldcc() #0 {
; VI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12
; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; VI-NEXT: v_or_b32_e32 v31, v0, v2
; VI-NEXT: s_mov_b64 s[4:5], s[0:1]
; VI-NEXT: s_mov_b64 s[6:7], s[2:3]
; VI-NEXT: s_mov_b64 s[0:1], s[88:89]
-; VI-NEXT: v_or_b32_e32 v31, v0, v2
; VI-NEXT: s_mov_b64 s[2:3], s[90:91]
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: s_mov_b32 s32, 0
@@ -256,14 +256,14 @@ define amdgpu_kernel void @call_fastcc() #0 {
; VI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12
; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; VI-NEXT: v_or_b32_e32 v31, v0, v2
; VI-NEXT: s_mov_b64 s[4:5], s[0:1]
; VI-NEXT: s_mov_b64 s[6:7], s[2:3]
; VI-NEXT: s_mov_b64 s[0:1], s[88:89]
-; VI-NEXT: v_or_b32_e32 v31, v0, v2
; VI-NEXT: s_mov_b64 s[2:3], s[90:91]
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index cdea4fd158b04c..04ad8c35b925b5 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -109,8 +109,9 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -207,8 +208,9 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -627,17 +629,17 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_add_u32 s6, s4, s6
-; CISI-NEXT: v_mov_b32_e32 v0, s4
; CISI-NEXT: s_addc_u32 s7, s5, s7
+; CISI-NEXT: v_mov_b32_e32 v0, s4
; CISI-NEXT: v_mov_b32_e32 v1, s5
; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT: v_mov_b32_e32 v2, s6
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
+; CISI-NEXT: v_mov_b32_e32 v2, s6
; CISI-NEXT: v_mov_b32_e32 v3, s7
; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
@@ -650,12 +652,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -673,8 +675,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
@@ -732,10 +734,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s6, s4, s6
; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -983,8 +985,9 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
; GFX11-NEXT: s_subb_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -1081,8 +1084,9 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -1502,17 +1506,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_sub_u32 s6, s4, s6
-; CISI-NEXT: v_mov_b32_e32 v0, s4
; CISI-NEXT: s_subb_u32 s7, s5, s7
+; CISI-NEXT: v_mov_b32_e32 v0, s4
; CISI-NEXT: v_mov_b32_e32 v1, s5
; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT: v_mov_b32_e32 v2, s6
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
+; CISI-NEXT: v_mov_b32_e32 v2, s6
; CISI-NEXT: v_mov_b32_e32 v3, s7
; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
@@ -1525,12 +1529,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -1548,8 +1552,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
@@ -1607,10 +1611,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s6, s4, s6
; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -2395,8 +2399,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_cmp_ge_u32 s1, s2
; GFX1010-NEXT: s_mov_b32 s1, 0
; GFX1010-NEXT: s_cselect_b32 s0, s3, s0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: .LBB16_3:
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -2557,8 +2561,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2
; GFX1030W32-NEXT: s_mov_b32 s1, 0
; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: .LBB16_3:
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -2718,8 +2722,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2
; GFX1030W64-NEXT: s_mov_b32 s1, 0
; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: .LBB16_3:
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -2902,7 +2906,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_cselect_b32 s0, s3, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: .LBB16_3:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 986dd8a0464244..7e59feba6fe8e0 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -54,13 +54,13 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
; GFX10-NEXT: s_add_u32 s6, s0, 16
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v3, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, s6
; GFX10-NEXT: v_mov_b32_e32 v5, s7
; GFX10-NEXT: v_mov_b32_e32 v7, s1
@@ -72,11 +72,11 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali
; GFX10-NEXT: flat_load_dword v11, v[6:7]
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_add_u32 s0, s2, 16
; GFX10-NEXT: s_addc_u32 s1, s3, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: s_add_u32 s2, s2, 24
; GFX10-NEXT: s_addc_u32 s3, s3, 0
@@ -179,11 +179,11 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s0, 8
; GFX10-NEXT: s_addc_u32 s5, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_add_u32 s6, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_addc_u32 s7, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_add_u32 s0, s0, 24
; GFX10-NEXT: s_addc_u32 s1, s1, 0
@@ -199,18 +199,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr
; GFX10-NEXT: s_add_u32 s0, s2, 8
; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: s_add_u32 s4, s2, 16
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: s_addc_u32 s5, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_add_u32 s0, s2, 24
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6
-; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; GFX10-NEXT: flat_store_dword v[0:1], v8
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 3216e71e6221ae..ae2a6959c106b1 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -110,11 +110,11 @@ define void @private_alloca_to_flat(ptr %ptr) {
; GISEL-ASM-LABEL: private_alloca_to_flat:
; GISEL-ASM: ; %bb.0:
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6
; GISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base
+; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6
; GISEL-ASM-NEXT: s_mov_b32 s5, s7
-; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4
; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4
; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 50c9c0cb64ccd6..839c4b693aa036 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -1018,12 +1018,12 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-NEXT: ; %bb.5: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB5_1
; GCN-NEXT: ; %bb.6: ; %bb8
@@ -1179,10 +1179,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: s_mov_b32 s5, s10
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_mov_b32 s7, s8
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
+; GCN-O0-NEXT: s_waitcnt expcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
+; GCN-O0-NEXT: s_waitcnt expcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
+; GCN-O0-NEXT: s_waitcnt expcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
+; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir
index 46a72c032827c0..004830397c98bc 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir
@@ -15,7 +15,7 @@ body: |
; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy
; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
@@ -37,7 +37,7 @@ body: |
; CHECK-LABEL: name: nonoverlapping_copy_kill
; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr0_sgpr1_sgpr2
+ ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6
; CHECK-NEXT: $sgpr2 = S_MOV_B32 $sgpr6, implicit killed $sgpr4_sgpr5_sgpr6
; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc
; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir
index 5efeb8d40afbb0..5286c6d34cf44a 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir
@@ -15,7 +15,7 @@ body: |
; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy
; CHECK: liveins: $sgpr30_sgpr31, $vgpr1_vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr1_vgpr2_vgpr3
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr1_vgpr2_vgpr3
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr1_vgpr2_vgpr3
; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec
@@ -36,7 +36,7 @@ body: |
; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy_1
; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr2_vgpr3_vgpr4
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3_vgpr4
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4
; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec
@@ -57,7 +57,7 @@ body: |
; CHECK-LABEL: name: nonoverlapping_copy_kill
; CHECK: liveins: $sgpr30_sgpr31, $vgpr3_vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr3_vgpr4_vgpr5
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr3_vgpr4_vgpr5
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr3_vgpr4_vgpr5
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit killed $vgpr3_vgpr4_vgpr5
; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec
@@ -78,7 +78,7 @@ body: |
; CHECK-LABEL: name: overlapping_copy_kill_half_s128
; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4_vgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir
index 9376a4c59c170d..06a5cbbcb52384 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir
@@ -14,7 +14,7 @@ body: |
; CHECK-LABEL: name: copy_has_implicit_kill_superreg
; CHECK: renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF
- ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit-def $vgpr7_vgpr8, implicit $vgpr10_vgpr11
+ ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr10_vgpr11
; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr10_vgpr11, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7
renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
index 7c21b3e0858044..ebfcb5883acd10 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
@@ -14,7 +14,7 @@ body: |
; GFX908-LABEL: name: copy_v64_to_v64
; GFX908: liveins: $vgpr2_vgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_v64_to_v64
@@ -30,7 +30,7 @@ body: |
; GFX10-LABEL: name: copy_v64_to_v64
; GFX10: liveins: $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
$vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
...
@@ -44,7 +44,7 @@ body: |
; GFX908-LABEL: name: copy_s64_to_v64
; GFX908: liveins: $sgpr2_sgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr2_sgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_s64_to_v64
@@ -60,7 +60,7 @@ body: |
; GFX10-LABEL: name: copy_s64_to_v64
; GFX10: liveins: $sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr2_sgpr3
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
$vgpr0_vgpr1 = COPY killed $sgpr2_sgpr3, implicit $exec
...
@@ -74,25 +74,25 @@ body: |
; GFX908-LABEL: name: copy_a64_to_v64
; GFX908: liveins: $agpr2_agpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+ ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr2_agpr3
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_a64_to_v64
; GFX90A: liveins: $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+ ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr2_agpr3
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
;
; GFX940-LABEL: name: copy_a64_to_v64
; GFX940: liveins: $agpr2_agpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+ ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr2_agpr3
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
;
; GFX10-LABEL: name: copy_a64_to_v64
; GFX10: liveins: $agpr2_agpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
+ ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr2_agpr3
; GFX10-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
$vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec
...
@@ -106,7 +106,7 @@ body: |
; GFX908-LABEL: name: copy_v128_to_v128_fwd
; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
@@ -114,19 +114,19 @@ body: |
; GFX90A-LABEL: name: copy_v128_to_v128_fwd
; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
;
; GFX940-LABEL: name: copy_v128_to_v128_fwd
; GFX940: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
;
; GFX10-LABEL: name: copy_v128_to_v128_fwd
; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
@@ -142,7 +142,7 @@ body: |
; GFX908-LABEL: name: copy_v128_to_v128_back
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
@@ -150,19 +150,19 @@ body: |
; GFX90A-LABEL: name: copy_v128_to_v128_back
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
;
; GFX940-LABEL: name: copy_v128_to_v128_back
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX940-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
;
; GFX10-LABEL: name: copy_v128_to_v128_back
; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
@@ -178,28 +178,28 @@ body: |
; GFX908-LABEL: name: copy_v96_to_v96
; GFX908: liveins: $vgpr4_vgpr5_vgpr6
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
;
; GFX90A-LABEL: name: copy_v96_to_v96
; GFX90A: liveins: $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
;
; GFX940-LABEL: name: copy_v96_to_v96
; GFX940: liveins: $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
;
; GFX10-LABEL: name: copy_v96_to_v96
; GFX10: liveins: $vgpr4_vgpr5_vgpr6
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
$vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec
@@ -214,7 +214,7 @@ body: |
; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX908: liveins: $vgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0
@@ -230,7 +230,7 @@ body: |
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX10: liveins: $vgpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
$vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
...
@@ -244,7 +244,7 @@ body: |
; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX908: liveins: $vgpr2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1
@@ -260,7 +260,7 @@ body: |
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX10: liveins: $vgpr2
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
$vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec
...
@@ -274,7 +274,7 @@ body: |
; GFX908-LABEL: name: copy_s128_to_v128_killed
; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
@@ -282,19 +282,19 @@ body: |
; GFX90A-LABEL: name: copy_s128_to_v128_killed
; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
;
; GFX940-LABEL: name: copy_s128_to_v128_killed
; GFX940: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
;
; GFX10-LABEL: name: copy_s128_to_v128_killed
; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
@@ -310,25 +310,25 @@ body: |
; GFX908-LABEL: name: copy_v64_to_v64_unaligned
; GFX908: liveins: $vgpr2_vgpr3
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
;
; GFX90A-LABEL: name: copy_v64_to_v64_unaligned
; GFX90A: liveins: $vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
;
; GFX940-LABEL: name: copy_v64_to_v64_unaligned
; GFX940: liveins: $vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
;
; GFX10-LABEL: name: copy_v64_to_v64_unaligned
; GFX10: liveins: $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr2_vgpr3
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
$vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec
...
@@ -342,25 +342,25 @@ body: |
; GFX908-LABEL: name: copy_v64_unaligned_to_v64
; GFX908: liveins: $vgpr3_vgpr4
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr3_vgpr4
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
;
; GFX90A-LABEL: name: copy_v64_unaligned_to_v64
; GFX90A: liveins: $vgpr3_vgpr4
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr3_vgpr4
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
;
; GFX940-LABEL: name: copy_v64_unaligned_to_v64
; GFX940: liveins: $vgpr3_vgpr4
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
+ ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr3_vgpr4
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
;
; GFX10-LABEL: name: copy_v64_unaligned_to_v64
; GFX10: liveins: $vgpr3_vgpr4
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr3_vgpr4
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
$vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec
...
@@ -374,7 +374,7 @@ body: |
; GFX908-LABEL: name: copy_v128_to_v128_unaligned
; GFX908: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
@@ -382,7 +382,7 @@ body: |
; GFX90A-LABEL: name: copy_v128_to_v128_unaligned
; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
@@ -390,7 +390,7 @@ body: |
; GFX940-LABEL: name: copy_v128_to_v128_unaligned
; GFX940: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
@@ -398,7 +398,7 @@ body: |
; GFX10-LABEL: name: copy_v128_to_v128_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
@@ -414,7 +414,7 @@ body: |
; GFX908-LABEL: name: copy_v128_unaligned_to_v128
; GFX908: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
@@ -422,7 +422,7 @@ body: |
; GFX90A-LABEL: name: copy_v128_unaligned_to_v128
; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
@@ -430,7 +430,7 @@ body: |
; GFX940-LABEL: name: copy_v128_unaligned_to_v128
; GFX940: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
@@ -438,7 +438,7 @@ body: |
; GFX10-LABEL: name: copy_v128_unaligned_to_v128
; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
@@ -454,25 +454,25 @@ body: |
; GFX908-LABEL: name: copy_s64_to_v64_unaligned
; GFX908: liveins: $sgpr8_sgpr9
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
;
; GFX90A-LABEL: name: copy_s64_to_v64_unaligned
; GFX90A: liveins: $sgpr8_sgpr9
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
;
; GFX940-LABEL: name: copy_s64_to_v64_unaligned
; GFX940: liveins: $sgpr8_sgpr9
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
;
; GFX10-LABEL: name: copy_s64_to_v64_unaligned
; GFX10: liveins: $sgpr8_sgpr9
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
$vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec
...
@@ -486,7 +486,7 @@ body: |
; GFX908-LABEL: name: copy_s128_to_v128_unaligned
; GFX908: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
@@ -494,7 +494,7 @@ body: |
; GFX90A-LABEL: name: copy_s128_to_v128_unaligned
; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
@@ -502,7 +502,7 @@ body: |
; GFX940-LABEL: name: copy_s128_to_v128_unaligned
; GFX940: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
@@ -510,7 +510,7 @@ body: |
; GFX10-LABEL: name: copy_s128_to_v128_unaligned
; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
@@ -526,28 +526,28 @@ body: |
; GFX908-LABEL: name: copy_v96_to_v96_unaligned
; GFX908: liveins: $vgpr8_vgpr9_vgpr10
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
;
; GFX90A-LABEL: name: copy_v96_to_v96_unaligned
; GFX90A: liveins: $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
;
; GFX940-LABEL: name: copy_v96_to_v96_unaligned
; GFX940: liveins: $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
;
; GFX10-LABEL: name: copy_v96_to_v96_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
$vgpr1_vgpr2_vgpr3 = COPY killed $vgpr8_vgpr9_vgpr10, implicit $exec
@@ -562,28 +562,28 @@ body: |
; GFX908-LABEL: name: copy_v96_unaligned_to_v96
; GFX908: liveins: $vgpr7_vgpr8_vgpr9
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
;
; GFX90A-LABEL: name: copy_v96_unaligned_to_v96
; GFX90A: liveins: $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
;
; GFX940-LABEL: name: copy_v96_unaligned_to_v96
; GFX940: liveins: $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
;
; GFX10-LABEL: name: copy_v96_unaligned_to_v96
; GFX10: liveins: $vgpr7_vgpr8_vgpr9
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
$vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec
@@ -598,28 +598,28 @@ body: |
; GFX908-LABEL: name: copy_s96_to_v96
; GFX908: liveins: $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX90A-LABEL: name: copy_s96_to_v96
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX940-LABEL: name: copy_s96_to_v96
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX10-LABEL: name: copy_s96_to_v96
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
$vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec
@@ -634,28 +634,28 @@ body: |
; GFX908-LABEL: name: copy_s96_to_v96_unaligned
; GFX908: liveins: $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX90A-LABEL: name: copy_s96_to_v96_unaligned
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX940-LABEL: name: copy_s96_to_v96_unaligned
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
;
; GFX10-LABEL: name: copy_s96_to_v96_unaligned
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
+ ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
$vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index b4d450a90d5950..86a1f70ca5e375 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -660,8 +660,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1]
; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 84fcb3718c00ca..096d9685bb980e 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -106,10 +106,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -537,8 +537,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -702,12 +702,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -720,7 +721,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
@@ -821,14 +821,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
@@ -1014,8 +1014,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_min_u32_e32 v0, v0, v3
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_min_u32_e32 v0, 64, v0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1124,11 +1124,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1206,8 +1206,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b64 s0, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1510,10 +1510,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1589,10 +1589,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1666,10 +1666,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -2224,13 +2224,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 14
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: s_add_u32 s0, s0, 2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 3504546801c93b..118d73eebb08b1 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -351,9 +351,9 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
; SI-NEXT: s_mov_b32 s5, 0
; SI-NEXT: .LBB7_3: ; %endif
-; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -377,10 +377,10 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7]
; VI-NEXT: s_mov_b32 s5, 0
; VI-NEXT: .LBB7_3: ; %endif
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index f0c278a67c8bcc..b3778a02576470 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -572,8 +572,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1]
; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_endpgm
%cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index c4a742f4bf08df..a8318b94af559d 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -93,10 +93,10 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -514,8 +514,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ff1_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -673,8 +673,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
@@ -785,14 +785,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
@@ -978,8 +978,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, v3, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_min_u32_e32 v0, 64, v0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1100,14 +1100,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
@@ -1222,14 +1222,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
@@ -1347,14 +1347,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
@@ -1551,12 +1551,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1569,7 +1570,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, s3, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 59bc7f332bf1e4..7ce9b8b2723e73 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -185,9 +185,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v15, v23, v25
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_and_b32_e32 v6, 1, v30
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB0_3
@@ -1349,16 +1349,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18
+; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc
; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20
-; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc
; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5]
; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5]
-; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc
-; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-NEXT: v_mov_b32_e32 v1, s9
; GFX9-G-NEXT: v_mov_b32_e32 v2, s10
; GFX9-G-NEXT: v_mov_b32_e32 v3, s11
@@ -1391,14 +1391,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc
; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
-; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13]
; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22
; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13]
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8
; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28
-; GFX9-G-NEXT: v_mov_b32_e32 v0, v8
; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v0, v8
; GFX9-G-NEXT: v_mov_b32_e32 v1, v9
; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3
@@ -2472,9 +2472,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
; GFX9-NEXT: v_and_b32_e32 v12, 1, v26
-; GFX9-NEXT: v_mov_b32_e32 v17, v13
; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v17, v13
; GFX9-NEXT: v_mov_b32_e32 v16, v12
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB1_3
@@ -3457,17 +3457,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11
; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1]
; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v11, s9
; GFX9-G-NEXT: v_mov_b32_e32 v10, s8
+; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
; GFX9-G-NEXT: v_mov_b32_e32 v12, s10
; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
@@ -3482,23 +3482,23 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12
; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13
; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4
; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
-; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
-; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5]
; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-G-NEXT: v_mov_b32_e32 v1, 0
; GFX9-G-NEXT: v_mov_b32_e32 v11, s9
; GFX9-G-NEXT: v_mov_b32_e32 v10, s8
+; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
; GFX9-G-NEXT: v_mov_b32_e32 v12, s10
; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while
; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3535,8 +3535,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0
; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX9-G-NEXT: v_mov_b32_e32 v11, v1
; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v11, v1
; GFX9-G-NEXT: v_mov_b32_e32 v10, v0
; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 4d9c85ef99dcd1..c187b75cce4d13 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1327,28 +1327,28 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_mov_b32 s14, s10
; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_mov_b32 s12, s8
+; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s40, s40, s11
; CI-NEXT: s_mov_b64 s[10:11], s[6:7]
; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; CI-NEXT: s_load_dword s6, s[4:5], 0x2
; CI-NEXT: s_addc_u32 s41, s41, 0
+; CI-NEXT: s_mov_b32 s12, s8
; CI-NEXT: s_add_u32 s8, s4, 12
-; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; CI-NEXT: s_mov_b32 s13, s9
+; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3
; CI-NEXT: ds_read_b32 v41, v40
-; CI-NEXT: s_addc_u32 s9, s5, 0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
-; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
-; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT: s_mov_b32 s13, s9
+; CI-NEXT: s_addc_u32 s9, s5, 0
; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
; CI-NEXT: v_or_b32_e32 v31, v0, v2
+; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
@@ -1367,8 +1367,8 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s9
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1376,17 +1376,17 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_add_u32 s8, s4, 12
-; GFX9-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-NEXT: s_mov_b32 s13, s9
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6
; GFX9-NEXT: ds_read_b32 v42, v41
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_addc_u32 s9, s5, 0
; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi
; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_mov_b32_e32 v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 1f805b6d07f711..4b9056c9a3b96a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -840,8 +840,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_mov_b64 s[0:1], 0x7b
-; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
@@ -851,8 +851,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 67f2487aed73af..fdce37550185af 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -68,9 +68,9 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_cmp_eq_u32 s2, 3
; GCN-NEXT: s_cselect_b32 s2, 0x40100a3d, s3
; GCN-NEXT: s_cselect_b32 s3, 0x70a3d70a, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -105,9 +105,9 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN-NEXT: s_cselect_b32 s2, 0x70a3d70a, s8
-; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -146,8 +146,8 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 1
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -168,9 +168,9 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_cmp_eq_u32 s2, 1
; GCN-NEXT: s_cselect_b32 s2, s3, 0x3f847ae1
; GCN-NEXT: s_cselect_b32 s3, s4, 0x47ae147b
-; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -287,15 +287,17 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GCN-NEXT: s_load_dword s18, s[4:5], 0x2c
-; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s15, 0x40200000
+; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s13, 0x401c0000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s18, s18, 1
; GCN-NEXT: s_mov_b32 s11, 0x40180000
; GCN-NEXT: s_mov_b32 s9, 0x40140000
; GCN-NEXT: s_mov_b32 s7, 0x40100000
; GCN-NEXT: s_mov_b32 s5, 0x40080000
; GCN-NEXT: s_mov_b32 s3, 2.0
-; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: s_mov_b32 s2, s0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s6, s0
@@ -303,8 +305,6 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
; GCN-NEXT: s_mov_b32 s14, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s18, s18, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
@@ -339,22 +339,22 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x24
; GCN-NEXT: s_load_dword s16, s[4:5], 0x2c
+; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s13, 0x401c0000
; GCN-NEXT: s_mov_b32 s11, 0x40180000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s16, s16, 1
; GCN-NEXT: s_mov_b32 s9, 0x40140000
; GCN-NEXT: s_mov_b32 s7, 0x40100000
; GCN-NEXT: s_mov_b32 s5, 0x40080000
; GCN-NEXT: s_mov_b32 s3, 2.0
-; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: s_mov_b32 s2, s0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s6, s0
; GCN-NEXT: s_mov_b32 s8, s0
; GCN-NEXT: s_mov_b32 s10, s0
; GCN-NEXT: s_mov_b32 s12, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s16, s16, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v15, s15
@@ -423,8 +423,11 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_mov_b32 s37, 0x3ff00000
; GCN-NEXT: s_mov_b32 s36, 0
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
; GCN-NEXT: s_mov_b32 s59, 0x40280000
@@ -438,7 +441,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s43, 0x40100000
; GCN-NEXT: s_mov_b32 s41, 0x40080000
; GCN-NEXT: s_mov_b32 s39, 2.0
-; GCN-NEXT: s_mov_b32 s37, 0x3ff00000
; GCN-NEXT: s_mov_b32 s38, s36
; GCN-NEXT: s_mov_b32 s40, s36
; GCN-NEXT: s_mov_b32 s42, s36
@@ -453,8 +455,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s60, s36
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
@@ -505,8 +505,11 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_mov_b32 s36, 0
; GCN-NEXT: s_mov_b32 s67, 0x40300000
+; GCN-NEXT: s_mov_b32 s37, 0x3ff00000
+; GCN-NEXT: s_mov_b32 s36, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
@@ -521,7 +524,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s43, 0x40100000
; GCN-NEXT: s_mov_b32 s41, 0x40080000
; GCN-NEXT: s_mov_b32 s39, 2.0
-; GCN-NEXT: s_mov_b32 s37, 0x3ff00000
; GCN-NEXT: s_mov_b32 s38, s36
; GCN-NEXT: s_mov_b32 s40, s36
; GCN-NEXT: s_mov_b32 s42, s36
@@ -537,8 +539,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: s_mov_b32 s62, s36
; GCN-NEXT: s_mov_b32 s64, s36
; GCN-NEXT: s_mov_b32 s66, s36
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v31, s67
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7a81af5243ee07..f1e2316e6edd52 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -172,9 +172,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -185,9 +185,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c919..da2bf7faff9833 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -117,9 +117,9 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 6496b70b4d6973..aa1d36006197df 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -76,13 +76,13 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3
; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2
; GFX7-ALIGNED-NEXT: s_endpgm
@@ -220,22 +220,22 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1
-; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
+; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
-; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index a4573388731578..14cb82f7305e57 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1888,12 +1888,12 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s6, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_add_u32 s0, s4, 4
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s0, s4, 4
-; VI-NEXT: v_bfi_b32 v3, s6, v0, v1
; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_bfi_b32 v3, s6, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v3
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 5f75a2f29a026f..edfeb6303ebb94 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -34,9 +34,9 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -80,9 +80,9 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -124,9 +124,9 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -168,9 +168,9 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -212,9 +212,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -256,9 +256,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -306,9 +306,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -359,9 +359,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -634,10 +634,10 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -704,13 +704,13 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -787,14 +787,14 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 5b024a345edbab..4176fc7c07e29c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1498,11 +1498,11 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1563,11 +1563,11 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1628,11 +1628,11 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 33910947e6fac8..9f153dbc342ced 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -543,9 +543,9 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -605,9 +605,9 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -667,9 +667,9 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -892,9 +892,9 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1288,10 +1288,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
; GFX8-NEXT: v_rcp_f32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1364,10 +1364,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s3
; GFX8-NEXT: v_rcp_f32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -2163,10 +2163,10 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
-; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2313,10 +2313,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-NEXT: v_fma_f32 v3, v4, v2, v3
; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
-; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
index 8155ac7eb256ec..1860698c07df09 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -9,17 +9,17 @@
define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
; GCN-LABEL: same_address_fence_merge_write2:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GCN-NEXT: s_mov_b32 s1, 0x40100000
+; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2
+; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
-; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 5415af02ef89ca..1362e2d01a351c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -18,8 +18,8 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
@@ -93,8 +93,8 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11
-; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e674b57aae3efc..a60ab281bb65cb 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4290,8 +4290,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4305,8 +4305,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4318,8 +4318,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4340,8 +4340,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4359,8 +4359,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4376,8 +4376,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4408,8 +4408,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
; GCN1-NEXT: s_addc_u32 s1, s3, s1
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4429,8 +4429,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
; GCN2-NEXT: s_addc_u32 s1, s3, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4448,8 +4448,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s7
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4476,8 +4476,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
; GCN1-NEXT: s_addc_u32 s1, s1, s5
; GCN1-NEXT: s_add_u32 s0, s0, 16
; GCN1-NEXT: s_addc_u32 s1, s1, 0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s9
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4500,8 +4500,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
; GCN2-NEXT: s_addc_u32 s1, s1, s5
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4522,8 +4522,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
; GCN3-NEXT: v_mov_b32_e32 v0, s8
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s9
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4547,8 +4547,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4560,8 +4560,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4573,8 +4573,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s2
; GCN3-NEXT: v_mov_b32_e32 v3, s3
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4592,8 +4592,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4609,8 +4609,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4626,8 +4626,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4655,8 +4655,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_add_u32 s0, s2, s0
; GCN1-NEXT: s_addc_u32 s1, s3, s1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4674,8 +4674,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_add_u32 s0, s2, s0
; GCN2-NEXT: s_addc_u32 s1, s3, s1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4693,8 +4693,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: s_add_u32 s0, s2, s0
; GCN3-NEXT: s_addc_u32 s1, s3, s1
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s7
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4718,8 +4718,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
; GCN1-NEXT: v_mov_b32_e32 v0, s8
; GCN1-NEXT: s_add_u32 s0, s0, s4
; GCN1-NEXT: s_addc_u32 s1, s1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s9
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4740,8 +4740,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: s_add_u32 s0, s0, s4
; GCN2-NEXT: s_addc_u32 s1, s1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4762,8 +4762,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s8
; GCN3-NEXT: s_add_u32 s0, s0, s4
; GCN3-NEXT: s_addc_u32 s1, s1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v1, s9
+; GCN3-NEXT: v_mov_b32_e32 v3, s1
; GCN3-NEXT: v_mov_b32_e32 v2, s0
; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 1311560715ddd7..553f66dbbc2bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -2215,8 +2215,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2241,8 +2241,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2267,8 +2267,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2349,8 +2349,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3668,8 +3668,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3693,8 +3693,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3718,8 +3718,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3797,8 +3797,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4811,8 +4811,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4836,8 +4836,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4861,8 +4861,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4940,8 +4940,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5864,8 +5864,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5889,8 +5889,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5914,8 +5914,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5993,8 +5993,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6605,8 +6605,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: flat_load_dword v0, v[0:1]
-; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
; GCN1-NEXT: v_mov_b32_e32 v2, s5
; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6630,8 +6630,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v0, v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
; GCN2-NEXT: v_mov_b32_e32 v2, s5
; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6655,8 +6655,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6734,8 +6734,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
; GCN3-NEXT: v_mov_b32_e32 v2, s5
; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0b6bdedeb48fc9..cbc629bffeddf1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -29,8 +29,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -79,8 +79,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -163,8 +163,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB1_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -215,8 +215,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB1_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -318,8 +318,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -372,8 +372,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -464,8 +464,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB3_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -518,8 +518,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB3_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -617,8 +617,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -665,8 +665,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -745,8 +745,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB5_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -795,8 +795,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB5_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -894,8 +894,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -946,8 +946,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1033,8 +1033,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB7_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1085,8 +1085,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB7_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1184,8 +1184,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1233,8 +1233,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1316,8 +1316,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB9_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1367,8 +1367,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB9_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1469,8 +1469,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1522,8 +1522,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1613,8 +1613,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB11_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1666,8 +1666,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB11_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1764,8 +1764,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1811,8 +1811,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1890,8 +1890,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB13_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1939,8 +1939,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB13_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2037,8 +2037,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2088,8 +2088,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2174,8 +2174,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB15_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2225,8 +2225,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB15_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2323,8 +2323,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2373,8 +2373,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2457,8 +2457,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB17_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2509,8 +2509,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB17_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2612,8 +2612,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2666,8 +2666,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2758,8 +2758,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB19_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2812,8 +2812,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB19_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2911,8 +2911,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2959,8 +2959,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3039,8 +3039,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB21_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3089,8 +3089,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB21_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3188,8 +3188,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3240,8 +3240,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3327,8 +3327,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB23_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3379,8 +3379,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB23_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3478,8 +3478,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -3528,8 +3528,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -3613,8 +3613,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB25_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -3666,8 +3666,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB25_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -3771,8 +3771,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -3825,8 +3825,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -3918,8 +3918,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB27_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -3973,8 +3973,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB27_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4074,8 +4074,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4122,8 +4122,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4203,8 +4203,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB29_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4254,8 +4254,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB29_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4355,8 +4355,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4407,8 +4407,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4495,8 +4495,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB31_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4548,8 +4548,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB31_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4649,8 +4649,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4699,8 +4699,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4784,8 +4784,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB33_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4837,8 +4837,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB33_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -4942,8 +4942,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -4996,8 +4996,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5089,8 +5089,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_cbranch_vccz .LBB35_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5144,8 +5144,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_vccz .LBB35_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5245,8 +5245,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5293,8 +5293,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5374,8 +5374,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB37_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB37_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5526,8 +5526,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5578,8 +5578,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5666,8 +5666,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB39_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5719,8 +5719,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB39_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5820,8 +5820,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -5870,8 +5870,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -5955,8 +5955,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB41_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6008,8 +6008,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB41_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6113,8 +6113,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6167,8 +6167,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6260,8 +6260,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB43_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6315,8 +6315,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB43_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6416,8 +6416,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6464,8 +6464,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6545,8 +6545,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB45_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6596,8 +6596,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB45_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6697,8 +6697,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6749,8 +6749,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6837,8 +6837,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB47_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB47_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -6991,8 +6991,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7041,8 +7041,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7126,8 +7126,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB49_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7179,8 +7179,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB49_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7284,8 +7284,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7338,8 +7338,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7431,8 +7431,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_cbranch_vccz .LBB51_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7486,8 +7486,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_vccz .LBB51_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7587,8 +7587,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7635,8 +7635,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7716,8 +7716,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB53_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7767,8 +7767,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB53_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -7868,8 +7868,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -7920,8 +7920,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -8008,8 +8008,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB55_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
@@ -8061,8 +8061,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB55_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -8162,8 +8162,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8211,8 +8211,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8294,8 +8294,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GCN1-NEXT: s_cbranch_vccz .LBB57_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8345,8 +8345,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GCN2-NEXT: s_cbranch_vccz .LBB57_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8447,8 +8447,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8500,8 +8500,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8591,8 +8591,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB59_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8644,8 +8644,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB59_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8742,8 +8742,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8789,8 +8789,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8868,8 +8868,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB61_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8917,8 +8917,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB61_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9015,8 +9015,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9066,8 +9066,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9152,8 +9152,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GCN1-NEXT: s_cbranch_vccz .LBB63_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9203,8 +9203,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GCN2-NEXT: s_cbranch_vccz .LBB63_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9301,8 +9301,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9346,8 +9346,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9427,8 +9427,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9472,8 +9472,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9553,8 +9553,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9598,8 +9598,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9674,8 +9674,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB67_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9724,8 +9724,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB67_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9824,8 +9824,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9873,8 +9873,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9957,8 +9957,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_cbranch_vccz .LBB69_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10009,8 +10009,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_vccz .LBB69_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10105,8 +10105,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10148,8 +10148,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10220,8 +10220,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB71_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10268,8 +10268,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB71_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10364,8 +10364,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10411,8 +10411,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10490,8 +10490,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_cbranch_vccz .LBB73_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10540,8 +10540,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_vccz .LBB73_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10636,8 +10636,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10685,8 +10685,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10768,8 +10768,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB75_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10819,8 +10819,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB75_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10921,8 +10921,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10974,8 +10974,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11065,8 +11065,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB77_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11118,8 +11118,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB77_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11216,8 +11216,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11263,8 +11263,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11342,8 +11342,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB79_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11391,8 +11391,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB79_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11489,8 +11489,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11540,8 +11540,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11626,8 +11626,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB81_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11677,8 +11677,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB81_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12183,11 +12183,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12237,11 +12237,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12334,11 +12334,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN1-NEXT: .LBB91_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB91_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12388,11 +12388,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN2-NEXT: .LBB91_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB91_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12478,11 +12478,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN1-NEXT: s_cbranch_vccz .LBB92_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s13
; GCN1-NEXT: v_mov_b32_e32 v2, s14
; GCN1-NEXT: v_mov_b32_e32 v3, s15
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12532,11 +12532,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GCN2-NEXT: s_cbranch_vccz .LBB92_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s13
; GCN2-NEXT: v_mov_b32_e32 v2, s14
; GCN2-NEXT: v_mov_b32_e32 v3, s15
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12638,11 +12638,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s10
; GCN1-NEXT: v_mov_b32_e32 v1, s11
; GCN1-NEXT: v_mov_b32_e32 v2, s14
; GCN1-NEXT: v_mov_b32_e32 v3, s15
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12694,11 +12694,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s10
; GCN2-NEXT: v_mov_b32_e32 v1, s11
; GCN2-NEXT: v_mov_b32_e32 v2, s14
; GCN2-NEXT: v_mov_b32_e32 v3, s15
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12790,11 +12790,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB94_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s13
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12848,11 +12848,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB94_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s13
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12956,11 +12956,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13008,11 +13008,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13094,11 +13094,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN1-NEXT: s_cbranch_vccz .LBB96_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v4, s8
; GCN1-NEXT: v_mov_b32_e32 v0, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s13
; GCN1-NEXT: v_mov_b32_e32 v2, s14
; GCN1-NEXT: v_mov_b32_e32 v3, s15
+; GCN1-NEXT: v_mov_b32_e32 v4, s8
; GCN1-NEXT: v_mov_b32_e32 v5, s9
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13146,11 +13146,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN2-NEXT: s_cbranch_vccz .LBB96_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v4, s8
; GCN2-NEXT: v_mov_b32_e32 v0, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s13
; GCN2-NEXT: v_mov_b32_e32 v2, s14
; GCN2-NEXT: v_mov_b32_e32 v3, s15
+; GCN2-NEXT: v_mov_b32_e32 v4, s8
; GCN2-NEXT: v_mov_b32_e32 v5, s9
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13248,11 +13248,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v0, s10
; GCN1-NEXT: v_mov_b32_e32 v1, s11
; GCN1-NEXT: v_mov_b32_e32 v2, s14
; GCN1-NEXT: v_mov_b32_e32 v3, s15
+; GCN1-NEXT: v_mov_b32_e32 v5, s1
; GCN1-NEXT: v_mov_b32_e32 v4, s0
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13302,11 +13302,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s10
; GCN2-NEXT: v_mov_b32_e32 v1, s11
; GCN2-NEXT: v_mov_b32_e32 v2, s14
; GCN2-NEXT: v_mov_b32_e32 v3, s15
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13394,11 +13394,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN1-NEXT: s_cbranch_vccz .LBB98_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s13
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v4, s2
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13450,11 +13450,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GCN2-NEXT: s_cbranch_vccz .LBB98_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s13
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v4, s2
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13966,8 +13966,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14018,8 +14018,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14108,8 +14108,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB108_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14162,8 +14162,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB108_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14271,8 +14271,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14327,8 +14327,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14425,8 +14425,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB110_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14481,8 +14481,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB110_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14586,8 +14586,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14636,8 +14636,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14722,8 +14722,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB112_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
; GCN1-NEXT: v_mov_b32_e32 v3, s5
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14774,8 +14774,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB112_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
; GCN2-NEXT: v_mov_b32_e32 v3, s5
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14879,8 +14879,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14933,8 +14933,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15026,8 +15026,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB114_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15080,8 +15080,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB114_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15185,8 +15185,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15240,8 +15240,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15337,8 +15337,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB116_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15395,8 +15395,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB116_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15512,8 +15512,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15571,8 +15571,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15676,8 +15676,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_cbranch_vccz .LBB118_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15736,8 +15736,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_vccz .LBB118_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15849,8 +15849,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15902,8 +15902,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15995,8 +15995,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-NEXT: s_cbranch_vccz .LBB120_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s8
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s9
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v2, s0
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16051,8 +16051,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-NEXT: s_cbranch_vccz .LBB120_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s8
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16164,8 +16164,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GCN1-NEXT: s_endpgm
; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s3
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16221,8 +16221,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GCN2-NEXT: s_endpgm
; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16321,8 +16321,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GCN1-NEXT: s_cbranch_vccz .LBB122_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s12
; GCN1-NEXT: v_mov_b32_e32 v3, s13
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16379,8 +16379,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_vccz .LBB122_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s12
; GCN2-NEXT: v_mov_b32_e32 v3, s13
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
index 8991a062f37a4c..75f57fedc81fda 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
@@ -10,9 +10,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -25,9 +25,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -59,8 +59,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -78,8 +78,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -458,9 +458,9 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -473,9 +473,9 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -507,8 +507,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -526,8 +526,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -906,9 +906,9 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -921,9 +921,9 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -955,8 +955,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -974,8 +974,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1354,9 +1354,9 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1368,9 +1368,9 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1401,8 +1401,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1420,8 +1420,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,9 +1794,9 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1808,9 +1808,9 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,8 +1841,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1860,8 +1860,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2234,9 +2234,9 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2248,9 +2248,9 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2281,8 +2281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2300,8 +2300,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2674,9 +2674,9 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2688,9 +2688,9 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2721,8 +2721,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2740,8 +2740,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -3114,9 +3114,9 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3129,9 +3129,9 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3163,8 +3163,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3182,8 +3182,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3562,9 +3562,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3577,9 +3577,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3609,9 +3609,9 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3624,9 +3624,9 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3656,9 +3656,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3671,9 +3671,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3705,8 +3705,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3724,8 +3724,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4104,9 +4104,9 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4119,9 +4119,9 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4153,8 +4153,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4172,8 +4172,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4961,11 +4961,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4979,11 +4979,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5017,11 +5017,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 0x11940
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5035,11 +5035,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 0x11940
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5072,11 +5072,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5092,11 +5092,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5137,11 +5137,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5200,10 +5200,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX7-NEXT: s_addc_u32 s3, s9, s3
; GFX7-NEXT: s_add_u32 s2, s0, 32
; GFX7-NEXT: s_addc_u32 s3, s3, 0
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5224,10 +5224,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX8-NEXT: s_addc_u32 s3, s9, s3
; GFX8-NEXT: s_add_u32 s2, s0, 32
; GFX8-NEXT: s_addc_u32 s3, s3, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5379,11 +5379,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5397,11 +5397,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5436,11 +5436,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX7-NEXT: s_add_u32 s2, s8, s2
; GFX7-NEXT: s_addc_u32 s3, s9, s3
-; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5458,11 +5458,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; GFX8-NEXT: s_add_u32 s2, s8, s2
; GFX8-NEXT: s_addc_u32 s3, s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5912,9 +5912,9 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5927,9 +5927,9 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5961,8 +5961,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5980,8 +5980,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6360,9 +6360,9 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6375,9 +6375,9 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6409,8 +6409,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6428,8 +6428,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 36bddb7ac2fd68..5429d4519cc0f8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -478,8 +478,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -517,8 +517,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -553,8 +553,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -595,8 +595,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -636,8 +636,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB5_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -674,8 +674,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -709,8 +709,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cbranch_vccz .LBB6_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -748,8 +748,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cbranch_vccz .LBB6_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -784,8 +784,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cbranch_vccz .LBB6_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -826,8 +826,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: s_cbranch_vccz .LBB7_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -867,8 +867,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: s_cbranch_vccz .LBB7_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -905,8 +905,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: s_cbranch_vccz .LBB7_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1638,8 +1638,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1677,8 +1677,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1713,8 +1713,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1755,8 +1755,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1796,8 +1796,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1834,8 +1834,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB15_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1869,8 +1869,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
; GCN1-NEXT: s_cbranch_vccz .LBB16_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1908,8 +1908,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
; GCN2-NEXT: s_cbranch_vccz .LBB16_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1944,8 +1944,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
; GCN3-NEXT: s_cbranch_vccz .LBB16_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1986,8 +1986,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_cbranch_vccz .LBB17_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2027,8 +2027,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_cbranch_vccz .LBB17_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2065,8 +2065,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_cbranch_vccz .LBB17_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2856,8 +2856,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2900,8 +2900,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2941,8 +2941,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2988,8 +2988,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3034,8 +3034,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3077,8 +3077,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3117,8 +3117,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cbranch_vccz .LBB26_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3159,8 +3159,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cbranch_vccz .LBB26_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3198,8 +3198,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cbranch_vccz .LBB26_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3243,8 +3243,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cbranch_vccz .LBB27_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3287,8 +3287,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cbranch_vccz .LBB27_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3328,8 +3328,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cbranch_vccz .LBB27_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4151,8 +4151,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4195,8 +4195,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4236,8 +4236,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4283,8 +4283,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4329,8 +4329,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4372,8 +4372,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4412,8 +4412,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cbranch_vccz .LBB36_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4454,8 +4454,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cbranch_vccz .LBB36_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4493,8 +4493,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cbranch_vccz .LBB36_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4538,8 +4538,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cbranch_vccz .LBB37_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4582,8 +4582,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cbranch_vccz .LBB37_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4623,8 +4623,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cbranch_vccz .LBB37_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5446,8 +5446,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5489,8 +5489,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5529,8 +5529,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5575,8 +5575,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5620,8 +5620,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5662,8 +5662,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5701,8 +5701,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cbranch_vccz .LBB46_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5742,8 +5742,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cbranch_vccz .LBB46_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5780,8 +5780,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cbranch_vccz .LBB46_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5824,8 +5824,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cbranch_vccz .LBB47_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5867,8 +5867,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cbranch_vccz .LBB47_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5907,8 +5907,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cbranch_vccz .LBB47_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8644,8 +8644,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8687,8 +8687,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8727,8 +8727,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8773,8 +8773,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8818,8 +8818,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8860,8 +8860,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8899,8 +8899,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN1-NEXT: s_cbranch_vccz .LBB66_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8940,8 +8940,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN2-NEXT: s_cbranch_vccz .LBB66_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8978,8 +8978,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN3-NEXT: s_cbranch_vccz .LBB66_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9022,8 +9022,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cbranch_vccz .LBB67_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9065,8 +9065,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cbranch_vccz .LBB67_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9105,8 +9105,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cbranch_vccz .LBB67_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9927,8 +9927,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9970,8 +9970,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10010,8 +10010,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10056,8 +10056,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10101,8 +10101,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10143,8 +10143,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10182,8 +10182,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cbranch_vccz .LBB76_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10223,8 +10223,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cbranch_vccz .LBB76_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10261,8 +10261,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cbranch_vccz .LBB76_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10305,8 +10305,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cbranch_vccz .LBB77_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10348,8 +10348,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cbranch_vccz .LBB77_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10388,8 +10388,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cbranch_vccz .LBB77_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20984,8 +20984,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21030,8 +21030,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21073,8 +21073,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21122,8 +21122,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21170,8 +21170,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21215,8 +21215,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21257,8 +21257,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: s_cbranch_vccz .LBB137_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21301,8 +21301,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: s_cbranch_vccz .LBB137_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21342,8 +21342,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: s_cbranch_vccz .LBB137_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21389,8 +21389,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_cbranch_vccz .LBB138_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21435,8 +21435,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: s_cbranch_vccz .LBB138_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21478,8 +21478,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_cbranch_vccz .LBB138_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22375,8 +22375,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22424,8 +22424,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22470,8 +22470,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22522,8 +22522,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22573,8 +22573,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22621,8 +22621,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22666,8 +22666,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: s_cbranch_vccz .LBB147_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22714,8 +22714,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: s_cbranch_vccz .LBB147_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22759,8 +22759,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: s_cbranch_vccz .LBB147_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22810,8 +22810,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_cbranch_vccz .LBB148_2
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22860,8 +22860,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: s_cbranch_vccz .LBB148_2
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -22907,8 +22907,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_cbranch_vccz .LBB148_2
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s7
; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
index fe47461ebf9569..83b252f142efa7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
@@ -175,9 +175,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -189,9 +189,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -260,9 +260,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -274,9 +274,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -537,9 +537,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -551,9 +551,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -622,9 +622,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -636,9 +636,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -899,9 +899,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -913,9 +913,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -984,9 +984,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -998,9 +998,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1261,9 +1261,9 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1275,9 +1275,9 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1346,9 +1346,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1360,9 +1360,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1623,9 +1623,9 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1637,9 +1637,9 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1708,9 +1708,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1722,9 +1722,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2179,16 +2179,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2213,16 +2213,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2250,8 +2250,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2351,8 +2351,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2382,16 +2382,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2416,16 +2416,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2453,8 +2453,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2554,8 +2554,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2935,9 +2935,9 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2949,9 +2949,9 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3020,9 +3020,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3034,9 +3034,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3297,9 +3297,9 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3311,9 +3311,9 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3382,9 +3382,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3396,9 +3396,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3841,18 +3841,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-LABEL: flat_atomic_max_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3876,18 +3876,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX8-LABEL: flat_atomic_max_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3914,10 +3914,10 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4018,10 +4018,10 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4050,18 +4050,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_max_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4085,18 +4085,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-LABEL: flat_atomic_max_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4123,10 +4123,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4227,10 +4227,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5223,18 +5223,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5258,18 +5258,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5296,10 +5296,10 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5400,10 +5400,10 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5432,18 +5432,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5467,18 +5467,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5505,10 +5505,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5609,10 +5609,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6503,18 +6503,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6538,18 +6538,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6576,10 +6576,10 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6680,10 +6680,10 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6712,18 +6712,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6747,18 +6747,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6785,10 +6785,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6889,10 +6889,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7447,18 +7447,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-LABEL: flat_atomic_min_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7482,18 +7482,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX8-LABEL: flat_atomic_min_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7520,10 +7520,10 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7624,10 +7624,10 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7656,18 +7656,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_min_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7691,18 +7691,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX8-LABEL: flat_atomic_min_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7729,10 +7729,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7833,10 +7833,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8094,9 +8094,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8124,9 +8124,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8154,9 +8154,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_mov_b32_e32 v7, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8640,9 +8640,9 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8654,9 +8654,9 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8725,9 +8725,9 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -8739,9 +8739,9 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9002,9 +9002,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9016,9 +9016,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9087,9 +9087,9 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9101,9 +9101,9 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index f490ecf68d9840..900983a37d92a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -806,8 +806,8 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 64be9cb72a6ee3..89c7fea4fbfe6b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -81,16 +81,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; VI-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-NEXT: s_load_dword s3, s[8:9], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_add_f32_e64 v2, s4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -140,16 +140,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: v_add_f32_e64 v2, |s2|, |s2|
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -198,11 +198,11 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
; VI-NEXT: s_add_u32 s6, s4, 4
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0
; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
@@ -484,13 +484,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
+; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
+; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4
+; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4
-; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
@@ -505,16 +505,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4
; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4
; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3
; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0
; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
@@ -601,13 +601,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
+; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
+; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4|
+; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4|
-; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
@@ -622,13 +622,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
+; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
+; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4|
+; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0
; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4|
-; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0
; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
@@ -721,13 +721,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
+; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
+; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
@@ -744,13 +744,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
+; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mad_f16 v2, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
+; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index fe5601594dca8d..4f2959997c70e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -123,9 +123,9 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rndne_f32_e32 v1, s3
; VI-NEXT: v_rndne_f32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -166,10 +166,10 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_rndne_f32_e32 v2, s2
; VI-NEXT: v_rndne_f32_e32 v1, s1
; VI-NEXT: v_rndne_f32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -406,8 +406,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s3
-; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v10, s2
+; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 9a72fe96b5c3af..8811711a4c5c34 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3027,9 +3027,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; SI-NEXT: s_cselect_b32 s0, 0, s0
-; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -3047,9 +3047,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; VI-NEXT: s_cselect_b32 s0, 0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -3086,11 +3086,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_bitcmp1_b32 s2, 16
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d6f6d440f9a835..0c2e249830b080 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -412,9 +412,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v3, s1
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
+; CIVI-NEXT: v_mov_b32_e32 v3, s1
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CIVI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 52b6d2cbaa6ebe..67cc78cd921d94 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -191,9 +191,9 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -227,11 +227,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s3, 31
; VI-NEXT: s_bitset1_b32 s1, 31
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -278,9 +278,9 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 17e509acfb6e63..d86f243bb1bdf6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -217,9 +217,9 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s3, 31
; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -257,11 +257,11 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: s_bitset1_b32 s1, 31
; VI-NEXT: s_bitset1_b32 s0, 31
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index b2d30b751ae2c4..ffdab528b42a5a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1488,9 +1488,9 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 87f1303ab8f5d9..e7f04d76609388 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -67,9 +67,9 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -118,11 +118,11 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
; VI-NEXT: s_xor_b32 s1, s1, 0x80000000
; VI-NEXT: s_xor_b32 s0, s0, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 4216bdf409edaf..0347ab2416fa44 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -21,9 +21,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 4.0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -49,9 +49,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 4.0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index cc11e256d5544e..b9c8e413a111bb 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -28,8 +28,8 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -74,8 +74,8 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -155,8 +155,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -203,8 +203,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -248,8 +248,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -301,8 +301,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6
; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0
; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -355,8 +355,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2
; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -408,8 +408,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s6, s6
; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0
; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -458,8 +458,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -506,8 +506,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -562,8 +562,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0
; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1
; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -614,8 +614,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -678,8 +678,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -741,8 +741,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 5febd5256e7949..f2df6b7e178994 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -3018,9 +3018,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
@@ -3438,9 +3438,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_add_u32 s0, s4, 64
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -4013,9 +4013,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_add_u32 s0, s4, 64
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 43caa4c739fb3e..a77df552583eac 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -609,11 +609,11 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_mov_b32_e32 v1, s14
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
+; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 5ca81ce9f9e073..1b9679f6fccb16 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -511,11 +511,11 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_mov_b32_e32 v1, s14
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1
+; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9
; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6384fdba7a45a3..2d45715a804605 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1984,10 +1984,10 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 73b4428b03c81e..128688a919a69f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -7074,9 +7074,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -7109,9 +7109,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -7295,9 +7295,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -7330,9 +7330,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -7521,9 +7521,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -7560,9 +7560,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index cd6ed1e6b98c29..c2621857ab018f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4186,9 +4186,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4221,9 +4221,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index b49047c54d7dd0..db86356941badd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4186,9 +4186,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4221,9 +4221,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 5577029f502d08..392c25457b5460 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -3904,9 +3904,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3939,9 +3939,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -4155,9 +4155,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4190,9 +4190,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -4411,9 +4411,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: v_mov_b32_e32 v3, v11
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4450,9 +4450,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v0, v8
; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
; GFX6-NEXT: v_mov_b32_e32 v3, v11
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 492a30b67089c5..40b76db55ebbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4732,8 +4732,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4852,8 +4852,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 3e15b135eeab98..42d28b03b2db16 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -4144,9 +4144,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4198,10 +4198,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4263,8 +4263,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_addc_u32 s1, s3, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4333,8 +4333,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
; VI-NEXT: s_addc_u32 s1, s1, s5
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4395,9 +4395,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4448,8 +4448,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4510,8 +4510,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_add_u32 s0, s2, s0
; VI-NEXT: s_addc_u32 s1, s3, s1
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4577,8 +4577,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_u32 s0, s0, s4
; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index f7882e6f120222..48619aadac2811 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -2714,8 +2714,8 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB57_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4478,8 +4478,8 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4658,8 +4658,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_load_dword s3, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -4764,8 +4764,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_load_dword s7, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -4869,8 +4869,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
; VI-NEXT: s_add_u32 s4, s0, s4
; VI-NEXT: s_addc_u32 s5, s1, s5
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -4972,8 +4972,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s4, s0, s4
; VI-NEXT: s_addc_u32 s5, s1, s5
; VI-NEXT: s_load_dword s7, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
@@ -5731,8 +5731,8 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5911,8 +5911,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dword s3, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -6017,8 +6017,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_load_dword s7, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -6130,8 +6130,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s4, s0, s4
; VI-NEXT: s_addc_u32 s5, s1, s5
; VI-NEXT: s_load_dword s7, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
@@ -6889,8 +6889,8 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB116_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7722,8 +7722,8 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7902,8 +7902,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_load_dword s3, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -8008,8 +8008,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_load_dword s7, s[4:5], 0x10
; VI-NEXT: s_add_u32 s4, s4, 16
; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -8203,8 +8203,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s4, s0, s4
; VI-NEXT: s_addc_u32 s5, s1, s5
; VI-NEXT: s_load_dword s7, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index b1c68a06a818f9..53a52ade7e99c5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -140,10 +140,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -217,11 +217,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -428,10 +428,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -502,11 +502,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -708,10 +708,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -785,11 +785,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -996,10 +996,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1070,11 +1070,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1276,10 +1276,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1353,11 +1353,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1564,10 +1564,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1638,11 +1638,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1835,10 +1835,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -1906,11 +1906,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2105,10 +2105,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@@ -2173,11 +2173,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2367,10 +2367,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -2438,11 +2438,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2637,10 +2637,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@@ -2705,11 +2705,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2899,10 +2899,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -2970,11 +2970,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3169,10 +3169,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@@ -3237,11 +3237,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3431,10 +3431,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -3502,11 +3502,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3701,10 +3701,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
@@ -3769,11 +3769,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3972,10 +3972,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4049,11 +4049,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4260,10 +4260,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4334,11 +4334,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4650,10 +4650,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4727,11 +4727,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -4938,10 +4938,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5012,11 +5012,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5218,10 +5218,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5295,11 +5295,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5506,10 +5506,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5580,11 +5580,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: s_mov_b32 s8, s2
; CI-NEXT: s_mov_b32 s9, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s11
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5878,12 +5878,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; CI-NEXT: s_mov_b32 s10, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: s_mov_b64 s[8:9], s[0:1]
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v2, s6
; CI-NEXT: v_mov_b32_e32 v3, s7
+; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5899,11 +5899,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_addc_u32 s1, s1, s5
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5957,7 +5957,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3
-; CI-NEXT: v_mov_b32_e32 v4, s6
; CI-NEXT: s_mov_b32 s0, s10
; CI-NEXT: s_mov_b32 s1, s11
; CI-NEXT: s_mov_b32 s10, 0
@@ -5966,6 +5965,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; CI-NEXT: v_mov_b32_e32 v1, s13
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: v_mov_b32_e32 v4, s6
; CI-NEXT: v_mov_b32_e32 v5, s7
; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -5984,10 +5984,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; VI-NEXT: s_addc_u32 s3, s9, s3
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6198,11 +6198,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[8:9], s[0:1]
; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v2, s6
; CI-NEXT: v_mov_b32_e32 v3, s7
+; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -6216,11 +6216,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
; VI-NEXT: s_add_u32 s0, s0, s4
; VI-NEXT: s_addc_u32 s1, s1, s5
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6273,7 +6273,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3
-; CI-NEXT: v_mov_b32_e32 v4, s6
; CI-NEXT: s_mov_b32 s0, s10
; CI-NEXT: s_mov_b32 s1, s11
; CI-NEXT: s_mov_b32 s10, 0
@@ -6282,6 +6281,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v1, s13
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: v_mov_b32_e32 v4, s6
; CI-NEXT: v_mov_b32_e32 v5, s7
; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -6297,11 +6297,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; VI-NEXT: s_lshl_b64 s[2:3], s[14:15], 3
; VI-NEXT: s_add_u32 s2, s8, s2
; VI-NEXT: s_addc_u32 s3, s9, s3
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6555,10 +6555,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s4, s2
; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
-; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: s_mov_b32 s5, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -6636,10 +6636,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s4, s2
; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
-; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: s_mov_b32 s5, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -6714,10 +6714,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s4, s2
; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
-; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: s_mov_b32 s5, s3
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: s_mov_b32 s3, s7
+; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -6897,8 +6897,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -6966,8 +6966,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
@@ -7032,8 +7032,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
; CI-NEXT: s_endpgm
@@ -7226,10 +7226,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -7429,10 +7429,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 59a99a6a0328d4..bad501ae3853d5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -234,9 +234,9 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -343,9 +343,9 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -672,9 +672,9 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -781,9 +781,9 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1110,9 +1110,9 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1219,9 +1219,9 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1548,9 +1548,9 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1657,9 +1657,9 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1986,9 +1986,9 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2095,9 +2095,9 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2626,8 +2626,8 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB54_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2837,8 +2837,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB56_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3416,9 +3416,9 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3525,9 +3525,9 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3854,9 +3854,9 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3963,9 +3963,9 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -4483,10 +4483,10 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB84_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4700,10 +4700,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB86_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -4915,12 +4915,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5034,12 +5034,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5153,10 +5153,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-NEXT: s_add_u32 s4, s0, s4
; VI-NEXT: s_addc_u32 s5, s1, s5
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -5269,10 +5269,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s6, s0, s6
; VI-NEXT: s_addc_u32 s7, s1, s7
; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v3, s9
@@ -5945,10 +5945,10 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB98_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6162,10 +6162,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB100_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6377,12 +6377,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6496,12 +6496,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6619,10 +6619,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s6, s0, s6
; VI-NEXT: s_addc_u32 s7, s1, s7
; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v3, s9
@@ -7295,10 +7295,10 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB111_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7512,10 +7512,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB113_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8281,10 +8281,10 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: .LBB121_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8498,10 +8498,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: .LBB123_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8713,12 +8713,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8832,12 +8832,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8946,9 +8946,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
@@ -9056,10 +9056,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_add_u32 s6, s0, s6
; VI-NEXT: s_addc_u32 s7, s1, s7
; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v3, s9
@@ -9541,9 +9541,9 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -9650,9 +9650,9 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -9979,9 +9979,9 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -10088,9 +10088,9 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s34
; VI-NEXT: v_mov_b32_e32 v3, s35
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index fbe06b3651b06c..857d9943a586e3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -452,14 +452,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -800,14 +800,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1672,14 +1672,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2020,14 +2020,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2952,14 +2952,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -3300,14 +3300,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3728,14 +3728,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4076,14 +4076,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5007,14 +5007,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -5381,14 +5381,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5845,21 +5845,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
+; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0
; GFX9-NEXT: s_mov_b32 s42, s9
; GFX9-NEXT: s_mov_b32 s43, s8
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
@@ -5872,18 +5872,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5923,10 +5923,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-NEXT: s_mov_b32 s42, s9
+; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1064-NEXT: s_mov_b32 s42, s9
; GFX1064-NEXT: s_mov_b32 s43, s8
; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -5960,13 +5960,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6003,10 +6003,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9
-; GFX1032-NEXT: s_mov_b32 s33, s10
+; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b32 s43, s8
; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6039,13 +6039,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6304,21 +6304,21 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0
; GFX9-DPP-NEXT: s_mov_b32 s42, s9
; GFX9-DPP-NEXT: s_mov_b32 s43, s8
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
@@ -6331,18 +6331,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -6382,10 +6382,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-DPP-NEXT: s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1064-DPP-NEXT: s_mov_b32 s42, s9
; GFX1064-DPP-NEXT: s_mov_b32 s43, s8
; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6419,13 +6419,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6462,10 +6462,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b32 s43, s8
; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6498,13 +6498,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6798,16 +6798,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -6848,18 +6848,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -6904,15 +6904,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -6962,13 +6962,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7010,15 +7010,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -7067,13 +7067,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7393,16 +7393,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7471,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -7527,15 +7527,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7604,13 +7604,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7652,15 +7652,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7723,13 +7723,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -8600,14 +8600,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -8990,14 +8990,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10033,14 +10033,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -10423,14 +10423,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10948,14 +10948,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -11338,14 +11338,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11885,9 +11885,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
@@ -11900,18 +11900,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -11988,13 +11988,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12067,13 +12067,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12360,9 +12360,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
@@ -12375,18 +12375,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -12463,13 +12463,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12542,13 +12542,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12854,16 +12854,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -12904,18 +12904,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -12960,15 +12960,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -13018,13 +13018,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13066,15 +13066,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -13123,13 +13123,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13449,16 +13449,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13527,18 +13527,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -13583,15 +13583,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13660,13 +13660,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13708,15 +13708,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13779,13 +13779,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 77924222919984..488728deb084c4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -702,14 +702,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1403,14 +1403,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1741,14 +1741,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2442,14 +2442,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2780,14 +2780,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3238,9 +3238,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
@@ -3253,17 +3253,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -3338,13 +3338,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3414,13 +3414,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3678,9 +3678,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
@@ -3693,17 +3693,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -3778,13 +3778,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3854,13 +3854,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -4152,16 +4152,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4205,17 +4205,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
@@ -4262,15 +4262,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -4323,13 +4323,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -4372,15 +4372,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -4432,13 +4432,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -4770,16 +4770,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -4856,17 +4856,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
@@ -4913,15 +4913,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -4997,13 +4997,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
@@ -5048,15 +5048,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5126,13 +5126,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -5846,14 +5846,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -6232,14 +6232,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6786,9 +6786,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
@@ -6801,17 +6801,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6886,13 +6886,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -6962,13 +6962,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7226,9 +7226,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
@@ -7241,17 +7241,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7326,13 +7326,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7402,13 +7402,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7700,16 +7700,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -7753,17 +7753,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
@@ -7810,15 +7810,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -7871,13 +7871,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -7920,15 +7920,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -7980,13 +7980,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -8318,16 +8318,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8404,17 +8404,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
@@ -8461,15 +8461,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8545,13 +8545,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
@@ -8596,15 +8596,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8674,13 +8674,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index cb3291df891af4..94eaac147a9b69 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -702,14 +702,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1403,14 +1403,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1741,14 +1741,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2442,14 +2442,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2780,14 +2780,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3238,9 +3238,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
@@ -3253,17 +3253,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -3338,13 +3338,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3414,13 +3414,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3678,9 +3678,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
@@ -3693,17 +3693,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -3778,13 +3778,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -3854,13 +3854,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -4152,16 +4152,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4205,17 +4205,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
@@ -4262,15 +4262,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -4323,13 +4323,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -4372,15 +4372,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -4432,13 +4432,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -4770,16 +4770,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -4856,17 +4856,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
@@ -4913,15 +4913,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -4997,13 +4997,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
@@ -5048,15 +5048,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5126,13 +5126,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -5846,14 +5846,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -6232,14 +6232,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6786,9 +6786,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
@@ -6801,17 +6801,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6886,13 +6886,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -6962,13 +6962,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7226,9 +7226,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
@@ -7241,17 +7241,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7326,13 +7326,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7402,13 +7402,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
@@ -7700,16 +7700,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -7753,17 +7753,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
@@ -7810,15 +7810,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -7871,13 +7871,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -7920,15 +7920,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -7980,13 +7980,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
@@ -8318,16 +8318,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8404,17 +8404,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
@@ -8461,15 +8461,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8545,13 +8545,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
@@ -8596,15 +8596,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8674,13 +8674,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6dc3a1971a485f..62bca0d420ef4d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -512,14 +512,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -886,14 +886,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1844,14 +1844,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2218,14 +2218,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3176,14 +3176,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -3550,14 +3550,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -4004,14 +4004,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -4378,14 +4378,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -5335,14 +5335,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -5709,14 +5709,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6173,21 +6173,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
+; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0
; GFX9-NEXT: s_mov_b32 s42, s9
; GFX9-NEXT: s_mov_b32 s43, s8
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
@@ -6200,18 +6200,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -6251,10 +6251,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-NEXT: s_mov_b32 s42, s9
+; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1064-NEXT: s_mov_b32 s42, s9
; GFX1064-NEXT: s_mov_b32 s43, s8
; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6288,13 +6288,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6331,10 +6331,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9
-; GFX1032-NEXT: s_mov_b32 s33, s10
+; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b32 s43, s8
; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6367,13 +6367,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6632,21 +6632,21 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0
; GFX9-DPP-NEXT: s_mov_b32 s42, s9
; GFX9-DPP-NEXT: s_mov_b32 s43, s8
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
@@ -6659,18 +6659,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -6710,10 +6710,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11]
-; GFX1064-DPP-NEXT: s_mov_b32 s42, s9
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1064-DPP-NEXT: s_mov_b32 s42, s9
; GFX1064-DPP-NEXT: s_mov_b32 s43, s8
; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6747,13 +6747,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -6790,10 +6790,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9
-; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24
; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b32 s43, s8
; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
@@ -6826,13 +6826,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7126,16 +7126,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -7176,18 +7176,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -7232,15 +7232,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -7290,13 +7290,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7338,15 +7338,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -7395,13 +7395,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7721,16 +7721,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7799,18 +7799,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -7855,15 +7855,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -7932,13 +7932,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -7980,15 +7980,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -8051,13 +8051,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -8927,14 +8927,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9317,14 +9317,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10360,14 +10360,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -10750,14 +10750,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11275,14 +11275,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -11665,14 +11665,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT: s_mov_b32 s14, s10
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b32 s14, s10
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-DPP-NEXT: s_mov_b32 s32, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12211,9 +12211,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_mov_b64 s[46:47], 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
@@ -12226,18 +12226,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -12314,13 +12314,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12393,13 +12393,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12686,9 +12686,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
@@ -12701,18 +12701,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -12789,13 +12789,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -12868,13 +12868,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13180,16 +13180,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b32 s33, s10
; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -13230,18 +13230,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s43
; GFX9-NEXT: s_mov_b32 s13, s42
; GFX9-NEXT: s_mov_b32 s14, s33
; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -13286,15 +13286,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s33, s10
; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
-; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s13, s42
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -13344,13 +13344,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-NEXT: s_mov_b32 s12, s43
; GFX1064-NEXT: s_mov_b32 s13, s42
; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13392,15 +13392,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_mov_b32 s33, s10
; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
-; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s13, s42
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: s_movk_i32 s32, 0x400
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -13449,13 +13449,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-NEXT: s_mov_b32 s12, s43
; GFX1032-NEXT: s_mov_b32 s13, s42
; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -13775,16 +13775,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-DPP-NEXT: s_mov_b32 s33, s10
; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7]
-; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13853,18 +13853,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4
; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0
; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12
; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8
-; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-DPP-NEXT: s_mov_b32 s12, s43
; GFX9-DPP-NEXT: s_mov_b32 s13, s42
; GFX9-DPP-NEXT: s_mov_b32 s14, s33
; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53]
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55]
; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
@@ -13909,15 +13909,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_mov_b32 s33, s10
; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13986,13 +13986,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1064-DPP-NEXT: s_mov_b32 s12, s43
; GFX1064-DPP-NEXT: s_mov_b32 s13, s42
; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
@@ -14034,15 +14034,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_mov_b32 s33, s10
; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7]
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3]
-; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14105,13 +14105,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
; GFX1032-DPP-NEXT: s_mov_b32 s12, s43
; GFX1032-DPP-NEXT: s_mov_b32 s13, s42
; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index a2fca33af10464..91682f8cb1f168 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -84,10 +84,10 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_add_u32 s4, s0, 4
; CIVI-NEXT: s_addc_u32 s5, s1, 0
-; CIVI-NEXT: v_mov_b32_e32 v2, s4
; CIVI-NEXT: v_mov_b32_e32 v4, s3
-; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v2, s4
; CIVI-NEXT: v_mov_b32_e32 v3, s5
+; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: v_mov_b32_e32 v5, s2
; CIVI-NEXT: flat_store_short v[2:3], v4
@@ -116,8 +116,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: v_mov_b32_e32 v3, s3
; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CIVI-NEXT: s_endpgm
@@ -125,9 +125,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; GFX11-LABEL: load_v4f16_arg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
store <4 x half> %arg, ptr addrspace(1) %out
@@ -141,8 +141,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s4
-; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v5, s5
+; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
@@ -155,8 +155,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -168,10 +168,10 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_endpgm
store <8 x half> %arg, ptr addrspace(1) %out
@@ -403,19 +403,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
-; CI-NEXT: s_lshr_b32 s7, s0, 16
; CI-NEXT: s_lshr_b32 s8, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v3, s6
; CI-NEXT: s_lshr_b32 s6, s2, 16
+; CI-NEXT: s_lshr_b32 s7, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v7, s8
; CI-NEXT: v_cvt_f32_f16_e32 v5, s6
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: v_cvt_f32_f16_e32 v6, s3
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: s_add_u32 s0, s4, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT: s_add_u32 s0, s4, 16
; CI-NEXT: s_addc_u32 s1, s5, 0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -431,19 +431,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
-; VI-NEXT: s_lshr_b32 s7, s0, 16
; VI-NEXT: s_lshr_b32 s8, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s6
; VI-NEXT: s_lshr_b32 s6, s2, 16
+; VI-NEXT: s_lshr_b32 s7, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v7, s8
; VI-NEXT: v_cvt_f32_f16_e32 v5, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v6, s3
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -583,15 +583,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
-; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v7, s3
; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
@@ -604,15 +604,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
-; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
@@ -650,17 +650,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
+; CI-NEXT: s_lshr_b32 s5, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
-; CI-NEXT: s_lshr_b32 s5, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
-; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v9, s3
; CI-NEXT: v_mov_b32_e32 v8, s2
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -675,17 +675,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
+; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
-; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -732,32 +732,33 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; CI-NEXT: v_cvt_f32_f16_e32 v12, s3
; CI-NEXT: s_lshr_b32 s7, s2, 16
; CI-NEXT: s_lshr_b32 s8, s1, 16
-; CI-NEXT: s_lshr_b32 s6, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
; CI-NEXT: v_cvt_f32_f16_e32 v8, s2
; CI-NEXT: v_cvt_f32_f16_e32 v9, s0
-; CI-NEXT: s_add_u32 s0, s4, 48
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
+; CI-NEXT: s_lshr_b32 s6, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; CI-NEXT: s_add_u32 s0, s4, 48
; CI-NEXT: s_addc_u32 s1, s5, 0
-; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
-; CI-NEXT: v_mov_b32_e32 v17, s1
-; CI-NEXT: v_mov_b32_e32 v16, s0
-; CI-NEXT: s_add_u32 s0, s4, 32
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; CI-NEXT: v_mov_b32_e32 v17, s1
+; CI-NEXT: v_mov_b32_e32 v16, s0
+; CI-NEXT: s_add_u32 s0, s4, 32
+; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; CI-NEXT: s_addc_u32 s1, s5, 0
+; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; CI-NEXT: s_nop 0
; CI-NEXT: v_mov_b32_e32 v13, s1
-; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_add_u32 s0, s4, 16
; CI-NEXT: s_addc_u32 s1, s5, 0
-; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; CI-NEXT: s_nop 0
; CI-NEXT: v_mov_b32_e32 v9, s1
@@ -774,37 +775,38 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
+; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s9, s3, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s6
; VI-NEXT: v_cvt_f32_f16_e32 v4, s8
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s6
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
; VI-NEXT: v_cvt_f32_f16_e32 v12, s3
; VI-NEXT: s_lshr_b32 s7, s1, 16
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v8, s2
-; VI-NEXT: s_add_u32 s0, s4, 48
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4
-; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5
; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
+; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; VI-NEXT: s_add_u32 s0, s4, 48
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
+; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s4, 32
-; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
+; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v13, s1
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v9, s1
@@ -1253,10 +1255,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
-; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: s_addc_u32 s5, s3, 0
-; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1266,6 +1268,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-NEXT: v_mov_b32_e32 v13, s2
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v8, v1
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1287,19 +1290,18 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, v5
; CI-NEXT: v_cvt_f32_f16_e32 v0, v4
-; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_cvt_f32_f16_e32 v9, v1
; CI-NEXT: v_cvt_f32_f16_e32 v13, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v16
; CI-NEXT: v_cvt_f32_f16_e32 v1, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: s_add_u32 s0, s0, 32
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_mov_b32_e32 v15, s3
-; CI-NEXT: v_mov_b32_e32 v17, s1
+; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: v_mov_b32_e32 v17, s1
; CI-NEXT: v_mov_b32_e32 v16, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
@@ -1323,8 +1325,8 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-NEXT: v_mov_b32_e32 v19, s3
; VI-NEXT: v_mov_b32_e32 v18, s2
; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
@@ -1529,6 +1531,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1538,7 +1541,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
-; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7]
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1580,6 +1582,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_mov_b32_e32 v11, s3
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1593,7 +1596,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
-; CI-NEXT: v_mov_b32_e32 v11, s3
; CI-NEXT: v_mov_b32_e32 v10, s2
; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1608,6 +1610,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v11, s3
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1619,7 +1622,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
-; VI-NEXT: v_mov_b32_e32 v11, s3
; VI-NEXT: v_mov_b32_e32 v10, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1668,13 +1670,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v7, s3
; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: s_add_u32 s2, s0, 32
-; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_add_u32 s0, s0, 16
; CI-NEXT: v_mov_b32_e32 v15, s3
-; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -1716,13 +1718,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v8, s3
; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v15, s3
-; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s2
+; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1812,31 +1814,30 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v19, s3
; CI-NEXT: v_mov_b32_e32 v18, s2
-; CI-NEXT: s_add_u32 s2, s0, 0x70
-; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v12, s0
+; CI-NEXT: s_add_u32 s2, s0, 0x70
+; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v10, v8
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v5
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5
; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
-; CI-NEXT: v_mov_b32_e32 v15, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v5
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
-; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v9, v0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
@@ -1849,33 +1850,34 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
-; CI-NEXT: s_add_u32 s2, s0, 0x60
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
-; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; CI-NEXT: v_mov_b32_e32 v17, s3
+; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v7, v20
+; CI-NEXT: v_mov_b32_e32 v14, s2
+; CI-NEXT: s_add_u32 s2, s0, 0x60
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v12, v5
+; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_mov_b32_e32 v17, s3
; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_add_u32 s2, s0, 0x50
-; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; CI-NEXT: s_add_u32 s0, s0, 64
+; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
-; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: s_add_u32 s0, s0, 64
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
+; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v19, s3
-; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v18, s2
+; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
@@ -1906,10 +1908,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v18, s3
; VI-NEXT: v_mov_b32_e32 v17, s2
-; VI-NEXT: s_add_u32 s2, s0, 0x50
; VI-NEXT: v_mov_b32_e32 v12, s1
-; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s0
+; VI-NEXT: s_add_u32 s2, s0, 0x50
+; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1954,15 +1956,15 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
-; VI-NEXT: s_add_u32 s0, s0, 0x60
; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4]
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s0, 0x60
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
+; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v20, s3
-; VI-NEXT: v_mov_b32_e32 v14, s1
; VI-NEXT: v_mov_b32_e32 v19, s2
+; VI-NEXT: v_mov_b32_e32 v14, s1
; VI-NEXT: v_mov_b32_e32 v13, s0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
@@ -2137,8 +2139,8 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: flat_store_short v[0:1], v2
-; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_or_b32_e32 v2, v4, v3
+; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -2153,14 +2155,14 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
+; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
@@ -2360,10 +2362,10 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-NEXT: s_add_u32 s4, s2, 48
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v9, s3
-; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v8, s2
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_addc_u32 s3, s3, 0
@@ -2394,11 +2396,11 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
+; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_or_b32_e32 v0, v0, v18
; CI-NEXT: v_or_b32_e32 v3, v6, v2
@@ -2428,9 +2430,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: s_add_u32 s2, s2, 16
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
@@ -2712,50 +2714,50 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_cvt_f32_f16_e32 v8, s0
; CI-NEXT: s_lshr_b32 s0, s5, 16
-; CI-NEXT: s_lshr_b32 s11, s1, 16
; CI-NEXT: v_cvt_f32_f16_e32 v9, s0
; CI-NEXT: s_lshr_b32 s0, s6, 16
+; CI-NEXT: s_lshr_b32 s11, s1, 16
; CI-NEXT: s_lshr_b32 s12, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s10
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s11
; CI-NEXT: s_lshr_b32 s10, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v10, s0
; CI-NEXT: s_lshr_b32 s0, s7, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s11
; CI-NEXT: v_cvt_f32_f16_e32 v2, s12
; CI-NEXT: v_cvt_f32_f16_e32 v3, s10
-; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
; CI-NEXT: v_cvt_f32_f16_e32 v11, s0
-; CI-NEXT: v_cvt_f32_f16_e32 v12, s4
-; CI-NEXT: v_cvt_f32_f16_e32 v13, s5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
; CI-NEXT: v_cvt_f32_f16_e32 v6, s2
; CI-NEXT: v_cvt_f32_f16_e32 v7, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v12, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v13, s5
; CI-NEXT: v_cvt_f32_f16_e32 v14, s7
; CI-NEXT: v_cvt_f32_f16_e32 v15, s6
-; CI-NEXT: v_add_f32_e32 v1, v1, v9
-; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: v_add_f32_e32 v3, v3, v11
; CI-NEXT: v_add_f32_e32 v2, v2, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_add_f32_e32 v5, v5, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_add_f32_e32 v4, v4, v12
+; CI-NEXT: v_add_f32_e32 v1, v1, v9
+; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_add_f32_e32 v7, v7, v14
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_add_f32_e32 v6, v6, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_add_f32_e32 v5, v5, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_add_f32_e32 v4, v4, v12
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v3, v7, v3
+; CI-NEXT: v_or_b32_e32 v2, v6, v2
; CI-NEXT: v_or_b32_e32 v1, v5, v1
; CI-NEXT: v_or_b32_e32 v0, v4, v0
; CI-NEXT: v_mov_b32_e32 v4, s8
-; CI-NEXT: v_or_b32_e32 v3, v7, v3
-; CI-NEXT: v_or_b32_e32 v2, v6, v2
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e402..0872862d84d204 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -218,8 +218,8 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: ; %bb.5: ; %bb43
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, s8
-; CHECK-NEXT: v_mov_b32_e32 v0, s8
; CHECK-NEXT: v_readlane_b32 s36, v7, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s8
; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
@@ -287,11 +287,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s6, s8
; CHECK-NEXT: s_mov_b32 s7, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: v_readlane_b32 s36, v7, 56
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: v_mov_b32_e32 v2, s7
; CHECK-NEXT: v_readlane_b32 s37, v7, 57
; CHECK-NEXT: v_readlane_b32 s38, v7, 58
@@ -304,7 +304,6 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
; CHECK-NEXT: s_and_b64 vcc, exec, 0
; CHECK-NEXT: v_readlane_b32 s44, v6, 0
; CHECK-NEXT: v_readlane_b32 s45, v6, 1
@@ -314,6 +313,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s49, v6, 5
; CHECK-NEXT: v_readlane_b32 s50, v6, 6
; CHECK-NEXT: v_readlane_b32 s51, v6, 7
+; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 3d27b5fe7f30b3..3cc6351106bd10 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -263,11 +263,11 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
-; GFX8V4-NEXT: s_add_u32 s0, s8, 8
-; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
@@ -288,11 +288,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
-; GFX8V5-NEXT: s_add_u32 s0, s8, 8
-; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 74020c43a3ca3f..cfb6245ecdec0c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -3522,7 +3522,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5
; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6
@@ -3535,6 +3534,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 13
; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 14
; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16
@@ -3608,6 +3608,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
; GFX9-IDXMODE: ; %bb.0: ; %entry
; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3
@@ -3623,10 +3624,9 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15
; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12
@@ -3839,7 +3839,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in,
; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
-; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
@@ -3855,6 +3854,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in,
; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s21
; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s22
; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s23
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5
@@ -4357,9 +4357,9 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc
@@ -4370,16 +4370,16 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
-; VI-NEXT: v_mov_b32_e32 v17, s3
+; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; VI-NEXT: v_mov_b32_e32 v17, s3
; VI-NEXT: v_mov_b32_e32 v16, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc
@@ -6952,8 +6952,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
; SI-MOVREL-NEXT: s_mov_b32 m0, s3
-; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
@@ -7010,9 +7010,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: s_mov_b32 m0, s3
; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2
; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
-; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
@@ -7045,23 +7045,24 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
-; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70
; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
+; VI-MOVREL-NEXT: s_nop 0
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
+; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
-; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50
-; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
@@ -7096,8 +7097,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
; VI-IDXMODE-NEXT: s_set_gpr_idx_off
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
@@ -7133,23 +7134,24 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
-; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70
; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
+; VI-IDXMODE-NEXT: s_nop 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
-; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50
-; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
@@ -7184,8 +7186,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o
; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
-; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
@@ -8083,16 +8085,16 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32
; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0
+; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0
+; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
; VI-MOVREL-NEXT: s_mov_b32 m0, s6
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
@@ -8111,17 +8113,17 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
+; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
@@ -8347,16 +8349,16 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32
; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0
+; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0
+; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
; VI-MOVREL-NEXT: s_mov_b32 m0, s6
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
@@ -8375,17 +8377,17 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
+; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
@@ -8612,17 +8614,17 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32
; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0
+; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5
; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
+; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2
; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0
-; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2
; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2
; VI-MOVREL-NEXT: s_mov_b32 m0, s0
; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
@@ -8640,17 +8642,17 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out,
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32
-; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
+; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4
-; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16
; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5
; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0
; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
@@ -8899,9 +8901,9 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; VI-MOVREL-NEXT: s_lshl_b32 s2, s2, 2
-; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; VI-MOVREL-NEXT: s_mov_b32 m0, s2
; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8
; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9
; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10
; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index add8c0f75bf335..7ed3179f4b0e05 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -23,8 +23,8 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -68,11 +68,11 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 1
-; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -95,8 +95,8 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -116,7 +116,6 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
@@ -124,8 +123,9 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
-; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
+; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
@@ -149,7 +149,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
@@ -166,6 +165,7 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v
; GCN-NEXT: v_mov_b32_e32 v14, s22
; GCN-NEXT: v_mov_b32_e32 v15, s23
; GCN-NEXT: s_mov_b32 m0, s4
+; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
@@ -204,7 +204,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x70
; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v33, s3
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
@@ -236,6 +235,7 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v
; GCN-NEXT: v_mov_b32_e32 v29, s21
; GCN-NEXT: v_mov_b32_e32 v30, s22
; GCN-NEXT: v_mov_b32_e32 v31, s23
+; GCN-NEXT: v_mov_b32_e32 v33, s3
; GCN-NEXT: v_mov_b32_e32 v32, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x60
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
@@ -300,8 +300,8 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec,
; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
@@ -429,8 +429,8 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec,
; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
@@ -453,8 +453,8 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
@@ -551,11 +551,11 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-NEXT: s_or_b32 s0, s0, s8
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_or_b32 s0, s0, s7
-; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -578,11 +578,11 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v
; GCN-NEXT: s_cmp_eq_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1
; GCN-NEXT: s_cselect_b32 s0, 0, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -619,13 +619,13 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v
; GCN-NEXT: s_add_u32 s0, s10, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_addc_u32 s1, s11, 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_add_u32 s0, s10, 32
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NEXT: s_add_u32 s0, s10, 32
; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s2
@@ -634,9 +634,9 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v
; GCN-NEXT: s_addc_u32 s1, s11, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -735,17 +735,18 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v
; GCN-NEXT: s_addc_u32 s1, s7, 0
; GCN-NEXT: v_mov_b32_e32 v15, s1
; GCN-NEXT: v_mov_b32_e32 v14, s0
-; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-NEXT: s_add_u32 s0, s6, 48
+; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GCN-NEXT: s_addc_u32 s1, s7, 0
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s7
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: s_addc_u32 s1, s7, 0
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_add_u32 s0, s6, 32
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13]
; GCN-NEXT: s_addc_u32 s1, s7, 0
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
@@ -929,17 +930,18 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_add_u32 s0, s0, 0x60
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_add_u32 s0, s0, 0x60
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29]
-; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
@@ -1835,9 +1837,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_readlane_b32 s0, v6, 0
; GCN-NEXT: v_readlane_b32 s1, v6, 1
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 72cda5c718f5b2..20a2ff9733e82a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1980,10 +1980,10 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: .LBB42_2: ; %if
; SI-NEXT: s_load_dword s5, s[2:3], 0x0
; SI-NEXT: .LBB42_3: ; %endif
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -2004,10 +2004,10 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s5, s[2:3], 0x0
; VI-NEXT: .LBB42_3: ; %endif
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index f372a54894604c..f324010e4cfad0 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -661,9 +661,9 @@ define double @sitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v4, v0
; GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_or_b32_e32 v0, v4, v2
; GISEL-NEXT: v_or_b32_e32 v1, v5, v3
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s5
@@ -946,9 +946,9 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-LABEL: uitofp_i128_to_f64:
; GISEL: ; %bb.0: ; %itofp-entry
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GISEL-NEXT: v_mov_b32_e32 v4, s4
; GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1045,8 +1045,8 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
; GISEL-NEXT: s_cbranch_execz .LBB3_10
; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
-; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v8
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 8704f4e780448b..7c3878cbff3c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -866,8 +866,8 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s0, 2
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_byte v[2:3], v5
@@ -1002,10 +1002,10 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: flat_store_short v[2:3], v4
@@ -1121,9 +1121,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
@@ -1200,9 +1200,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
@@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -1400,8 +1400,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -1474,8 +1474,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -1549,10 +1549,10 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: flat_store_byte v[2:3], v4
@@ -1693,13 +1693,13 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16>
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 8
; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -1926,13 +1926,13 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s6, 16
; VI-NEXT: s_addc_u32 s5, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -2024,15 +2024,15 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s6, 16
; VI-NEXT: s_addc_u32 s5, s7, 0
-; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_dword v[1:2], v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -2129,19 +2129,19 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
+; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s8
@@ -2271,19 +2271,19 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
+; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: s_add_u32 s4, s8, 16
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s8
@@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -2653,8 +2653,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -2909,10 +2909,10 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -3020,10 +3020,10 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -3124,8 +3124,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -3582,10 +3582,10 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -4547,10 +4547,10 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s0, 8
; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v6, s4
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_byte v[4:5], v6
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4563,8 +4563,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s4, 1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -4821,8 +4821,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -4971,8 +4971,8 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -5096,10 +5096,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5121,10 +5121,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -5210,8 +5210,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -5240,11 +5241,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s4, 51
-; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v9, v[2:3]
; VI-NEXT: flat_load_ubyte v10, v[4:5]
@@ -5687,8 +5688,8 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s4, 42
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index a0ba97d3b639ca..7814c106ddee7c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -59,8 +59,8 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %s
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl:
; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
@@ -119,8 +119,8 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %s
define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl:
; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index ec100a9e5b0f8d..83925b578e7138 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -45,8 +45,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0|
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0|
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -101,8 +101,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -114,8 +114,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0|
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -204,8 +204,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -218,8 +218,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -264,8 +264,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -278,8 +278,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -324,8 +324,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -338,8 +338,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -384,8 +384,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -398,8 +398,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -444,8 +444,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -458,8 +458,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -504,8 +504,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -518,8 +518,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -564,8 +564,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -578,8 +578,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -624,8 +624,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -638,8 +638,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -684,8 +684,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -698,8 +698,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -744,8 +744,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -758,8 +758,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -804,8 +804,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -818,8 +818,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -864,8 +864,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -878,8 +878,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -924,8 +924,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -938,8 +938,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -984,8 +984,8 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -998,8 +998,8 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -1861,8 +1861,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -1876,8 +1876,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -1928,8 +1928,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -1943,8 +1943,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2034,8 +2034,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2048,8 +2048,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2095,8 +2095,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2109,8 +2109,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2156,8 +2156,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2170,8 +2170,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2217,8 +2217,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2231,8 +2231,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2278,8 +2278,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2292,8 +2292,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2339,8 +2339,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2353,8 +2353,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2414,8 +2414,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2461,8 +2461,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2475,8 +2475,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2522,8 +2522,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2536,8 +2536,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2583,8 +2583,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2597,8 +2597,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2644,8 +2644,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2658,8 +2658,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2704,8 +2704,8 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2718,8 +2718,8 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2764,8 +2764,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2778,8 +2778,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -2824,8 +2824,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
@@ -2838,8 +2838,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 13a53f0b96de2d..8278fac5ea61c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -44,8 +44,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -71,8 +71,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -146,8 +146,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -173,8 +173,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -206,8 +206,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -233,8 +233,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -266,8 +266,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -293,8 +293,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -326,8 +326,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -353,8 +353,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -386,8 +386,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -413,8 +413,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -446,8 +446,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -473,8 +473,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -506,8 +506,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -533,8 +533,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -566,8 +566,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -593,8 +593,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -626,8 +626,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -653,8 +653,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -684,8 +684,8 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -742,8 +742,8 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -800,8 +800,8 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -858,8 +858,8 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -916,8 +916,8 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -974,8 +974,8 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -1090,8 +1090,8 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -1148,8 +1148,8 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -1206,8 +1206,8 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
@@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1293,8 +1293,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1368,8 +1368,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1395,8 +1395,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1428,8 +1428,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1455,8 +1455,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1488,8 +1488,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1515,8 +1515,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1548,8 +1548,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1575,8 +1575,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1608,8 +1608,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1635,8 +1635,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1668,8 +1668,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1695,8 +1695,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1728,8 +1728,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1755,8 +1755,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1788,8 +1788,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1815,8 +1815,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1848,8 +1848,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
@@ -1875,8 +1875,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
@@ -1912,8 +1912,8 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
index 80f295b939709b..0f1a487d134310 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
@@ -15,8 +15,8 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa
; GCN-NEXT: v_sub_f32_e32 v4, v2, v3
; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2
; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index 2fced3240fe358..f7a0437d3afc59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -84,11 +84,11 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX6789-LABEL: load_1d_tfe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v6, 0
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v7, v6
; GFX6789-NEXT: v_mov_b32_e32 v8, v6
; GFX6789-NEXT: v_mov_b32_e32 v9, v6
; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v6
; GFX6789-NEXT: v_mov_b32_e32 v1, v7
; GFX6789-NEXT: v_mov_b32_e32 v2, v8
@@ -199,11 +199,11 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX6789-LABEL: load_1d_lwe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v6, 0
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v7, v6
; GFX6789-NEXT: v_mov_b32_e32 v8, v6
; GFX6789-NEXT: v_mov_b32_e32 v9, v6
; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v6
; GFX6789-NEXT: v_mov_b32_e32 v1, v7
; GFX6789-NEXT: v_mov_b32_e32 v2, v8
@@ -352,12 +352,12 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX6789-LABEL: load_2d_tfe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v7, 0
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v8, v7
; GFX6789-NEXT: v_mov_b32_e32 v9, v7
; GFX6789-NEXT: v_mov_b32_e32 v10, v7
; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v7
; GFX6789-NEXT: v_mov_b32_e32 v1, v8
; GFX6789-NEXT: v_mov_b32_e32 v2, v9
@@ -515,13 +515,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX6789-LABEL: load_3d_tfe_lwe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v8, 0
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v9, v8
; GFX6789-NEXT: v_mov_b32_e32 v10, v8
; GFX6789-NEXT: v_mov_b32_e32 v11, v8
; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v8
; GFX6789-NEXT: v_mov_b32_e32 v1, v9
; GFX6789-NEXT: v_mov_b32_e32 v2, v10
@@ -681,13 +681,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace
; GFX6789-LABEL: load_cube_lwe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v8, 0
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v9, v8
; GFX6789-NEXT: v_mov_b32_e32 v10, v8
; GFX6789-NEXT: v_mov_b32_e32 v11, v8
; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v8
; GFX6789-NEXT: v_mov_b32_e32 v1, v9
; GFX6789-NEXT: v_mov_b32_e32 v2, v10
@@ -839,12 +839,12 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp
; GFX6789-LABEL: load_1darray_tfe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v7, 0
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v8, v7
; GFX6789-NEXT: v_mov_b32_e32 v9, v7
; GFX6789-NEXT: v_mov_b32_e32 v10, v7
; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v7
; GFX6789-NEXT: v_mov_b32_e32 v1, v8
; GFX6789-NEXT: v_mov_b32_e32 v2, v9
@@ -1002,13 +1002,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp
; GFX6789-LABEL: load_2darray_lwe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v8, 0
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v9, v8
; GFX6789-NEXT: v_mov_b32_e32 v10, v8
; GFX6789-NEXT: v_mov_b32_e32 v11, v8
; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v8
; GFX6789-NEXT: v_mov_b32_e32 v1, v9
; GFX6789-NEXT: v_mov_b32_e32 v2, v10
@@ -1162,13 +1162,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
; GFX6789-LABEL: load_2dmsaa_both:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v8, 0
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v9, v8
; GFX6789-NEXT: v_mov_b32_e32 v10, v8
; GFX6789-NEXT: v_mov_b32_e32 v11, v8
; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v8
; GFX6789-NEXT: v_mov_b32_e32 v1, v9
; GFX6789-NEXT: v_mov_b32_e32 v2, v10
@@ -1330,14 +1330,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
; GFX6789-LABEL: load_2darraymsaa_tfe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v9, 0
-; GFX6789-NEXT: v_mov_b32_e32 v8, v3
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v10, v9
; GFX6789-NEXT: v_mov_b32_e32 v11, v9
; GFX6789-NEXT: v_mov_b32_e32 v12, v9
; GFX6789-NEXT: v_mov_b32_e32 v13, v9
+; GFX6789-NEXT: v_mov_b32_e32 v8, v3
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
; GFX6789-NEXT: v_mov_b32_e32 v1, v10
; GFX6789-NEXT: v_mov_b32_e32 v2, v11
@@ -1497,12 +1497,12 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX6789-LABEL: load_mip_1d_lwe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v7, 0
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v8, v7
; GFX6789-NEXT: v_mov_b32_e32 v9, v7
; GFX6789-NEXT: v_mov_b32_e32 v10, v7
; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v7
; GFX6789-NEXT: v_mov_b32_e32 v1, v8
; GFX6789-NEXT: v_mov_b32_e32 v2, v9
@@ -1654,13 +1654,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX6789-LABEL: load_mip_2d_tfe:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v8, 0
-; GFX6789-NEXT: v_mov_b32_e32 v7, v2
-; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v9, v8
; GFX6789-NEXT: v_mov_b32_e32 v10, v8
; GFX6789-NEXT: v_mov_b32_e32 v11, v8
; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v7, v2
+; GFX6789-NEXT: v_mov_b32_e32 v6, v1
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v8
; GFX6789-NEXT: v_mov_b32_e32 v1, v9
; GFX6789-NEXT: v_mov_b32_e32 v2, v10
@@ -2133,10 +2133,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a
; GFX6789-LABEL: load_1d_tfe_V4_dmask3:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v5, 0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
; GFX6789-NEXT: v_mov_b32_e32 v6, v5
; GFX6789-NEXT: v_mov_b32_e32 v7, v5
; GFX6789-NEXT: v_mov_b32_e32 v8, v5
+; GFX6789-NEXT: v_mov_b32_e32 v4, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v5
; GFX6789-NEXT: v_mov_b32_e32 v1, v6
; GFX6789-NEXT: v_mov_b32_e32 v2, v7
@@ -2237,9 +2237,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a
; GFX6789-LABEL: load_1d_tfe_V4_dmask2:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v4, 0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
; GFX6789-NEXT: v_mov_b32_e32 v5, v4
; GFX6789-NEXT: v_mov_b32_e32 v6, v4
+; GFX6789-NEXT: v_mov_b32_e32 v3, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v4
; GFX6789-NEXT: v_mov_b32_e32 v1, v5
; GFX6789-NEXT: v_mov_b32_e32 v2, v6
@@ -2333,8 +2333,8 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr a
; GFX6789-LABEL: load_1d_tfe_V4_dmask1:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v3, 0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
; GFX6789-NEXT: v_mov_b32_e32 v4, v3
+; GFX6789-NEXT: v_mov_b32_e32 v2, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v3
; GFX6789-NEXT: v_mov_b32_e32 v1, v4
; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
@@ -2422,8 +2422,8 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr a
; GFX6789-LABEL: load_1d_tfe_V2_dmask1:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: v_mov_b32_e32 v3, 0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
; GFX6789-NEXT: v_mov_b32_e32 v4, v3
+; GFX6789-NEXT: v_mov_b32_e32 v2, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v3
; GFX6789-NEXT: v_mov_b32_e32 v1, v4
; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index c8421c66f97c38..104bce2b8cebf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -74,11 +74,11 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX6789-NEXT: s_mov_b64 s[14:15], exec
; GFX6789-NEXT: s_wqm_b64 exec, exec
; GFX6789-NEXT: v_mov_b32_e32 v6, 0
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v7, v6
; GFX6789-NEXT: v_mov_b32_e32 v8, v6
; GFX6789-NEXT: v_mov_b32_e32 v9, v6
; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v6
; GFX6789-NEXT: v_mov_b32_e32 v1, v7
; GFX6789-NEXT: v_mov_b32_e32 v2, v8
@@ -634,11 +634,11 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX6789-NEXT: s_mov_b64 s[14:15], exec
; GFX6789-NEXT: s_wqm_b64 exec, exec
; GFX6789-NEXT: v_mov_b32_e32 v6, 0
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v7, v6
; GFX6789-NEXT: v_mov_b32_e32 v8, v6
; GFX6789-NEXT: v_mov_b32_e32 v9, v6
; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
; GFX6789-NEXT: v_mov_b32_e32 v0, v6
; GFX6789-NEXT: v_mov_b32_e32 v1, v7
; GFX6789-NEXT: v_mov_b32_e32 v2, v8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index e0a5d397bded4d..caf40eba3db43e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -61,11 +61,11 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11]
-; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
+; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
; DAGISEL12-NEXT: ; %bb.2: ; %tail
@@ -112,8 +112,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL10-NEXT: s_mov_b64 exec, s[10:11]
-; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
+; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13
; DAGISEL10-NEXT: ; %bb.2: ; %tail
; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 8af5db9f629083..13e556f698ce99 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -114,14 +114,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f
; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16:
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: s_mov_b32 s20, s2
+; GFX12-GISEL-NEXT: s_mov_b32 s21, s3
; GFX12-GISEL-NEXT: s_mov_b32 s22, s4
; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s7, s5
-; GFX12-GISEL-NEXT: s_mov_b32 s21, s3
; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s7, s5
; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s8, s6
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
; GFX12-GISEL-NEXT: s_mov_b32 s16, s9
; GFX12-GISEL-NEXT: s_mov_b32 s17, s10
@@ -237,13 +237,13 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
; GFX12-GISEL: ; %bb.0: ; %main_body
; GFX12-GISEL-NEXT: s_mov_b32 s20, s3
; GFX12-GISEL-NEXT: s_mov_b32 s21, s4
-; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s8, s6
; GFX12-GISEL-NEXT: s_mov_b32 s22, s5
+; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s8, s6
; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s8, s6
; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s9, s7
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6
; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s5
; GFX12-GISEL-NEXT: s_mov_b32 s16, s10
@@ -376,19 +376,20 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000
; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000
-; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000
; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
+; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v8, s14
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v7, s13
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -397,9 +398,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
@@ -528,15 +529,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600
; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@@ -545,9 +547,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
@@ -688,29 +690,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0
; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7
-; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000
; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0
+; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000
; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000
; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000
+; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1]
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
@@ -845,22 +847,22 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX12-GISEL-NEXT: s_mov_b32 s4, 0
; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600
; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10
; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1]
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1]
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index 7283ec88a90d83..9afdfca78d726d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -213,8 +213,8 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr
; SDAG-NEXT: ; %bb.1: ; %if
; SDAG-NEXT: s_add_u32 s0, s0, 1
; SDAG-NEXT: s_addc_u32 s1, s1, 0
-; SDAG-NEXT: v_mov_b32_e32 v3, s1
; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: v_mov_b32_e32 v3, s1
; SDAG-NEXT: ; %bb.2: ; %endif
; SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 8d380516df8b55..a552e981e629ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -22,8 +22,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -95,8 +95,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -257,8 +257,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a31, s23
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
@@ -327,8 +327,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a31, s23
; GCN-NEXT: v_accvgpr_write_b32 a30, s22
; GCN-NEXT: v_accvgpr_write_b32 a29, s21
; GCN-NEXT: v_accvgpr_write_b32 a28, s20
@@ -396,8 +396,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -438,8 +438,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 44cb4e803ffad6..dd84c67b0f6270 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -60,8 +60,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
@@ -80,8 +80,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
@@ -107,8 +107,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
@@ -127,8 +127,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
@@ -159,8 +159,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -226,8 +226,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -243,15 +243,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
@@ -289,8 +289,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -356,8 +356,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -373,15 +373,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
@@ -508,8 +508,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
@@ -572,8 +572,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -593,10 +593,10 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -632,8 +632,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
@@ -696,8 +696,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -717,10 +717,10 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -755,8 +755,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -792,8 +792,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -834,8 +834,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -871,8 +871,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -984,8 +984,8 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
@@ -1035,8 +1035,8 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
@@ -1070,11 +1070,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s30
; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1142,8 +1142,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1159,15 +1159,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
@@ -1206,11 +1206,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_mov_b32_e32 v1, s25
; SDAG-NEXT: v_mov_b32_e32 v2, s26
; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
; SDAG-NEXT: v_mov_b32_e32 v6, s30
; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1278,8 +1278,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1295,15 +1295,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
@@ -1501,8 +1501,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1522,10 +1522,10 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1632,8 +1632,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1653,10 +1653,10 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1733,8 +1733,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1817,8 +1817,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -1906,8 +1906,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
@@ -1932,8 +1932,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 9a8282231ac15a..3c20e406e84d75 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1493,11 +1493,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
; SDAG-NEXT: v_mov_b32_e32 v23, v1
; SDAG-NEXT: v_mov_b32_e32 v22, v0
+; SDAG-NEXT: v_mov_b32_e32 v20, s28
; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
; SDAG-NEXT: v_mov_b32_e32 v4, s20
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
@@ -1506,6 +1505,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v9, s25
; SDAG-NEXT: v_mov_b32_e32 v10, s26
; SDAG-NEXT: v_mov_b32_e32 v11, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
; SDAG-NEXT: v_accvgpr_write_b32 a2, v22
; SDAG-NEXT: v_accvgpr_write_b32 a3, v23
@@ -1530,15 +1530,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v20, s28
; GISEL-NEXT: v_mov_b32_e32 v22, v0
; GISEL-NEXT: v_mov_b32_e32 v23, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, s28
; GISEL-NEXT: v_mov_b32_e32 v21, s29
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
; GISEL-NEXT: v_accvgpr_write_b32 a2, v22
; GISEL-NEXT: v_accvgpr_write_b32 a3, v23
@@ -1667,7 +1667,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v14, s0
; SDAG-NEXT: v_mov_b32_e32 v15, s1
; SDAG-NEXT: v_mov_b32_e32 v16, s2
@@ -1676,6 +1675,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
@@ -1697,10 +1697,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
@@ -1817,8 +1817,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
@@ -1857,8 +1857,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_movk_i32 s0, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
@@ -1941,10 +1941,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
@@ -1978,7 +1978,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -1987,6 +1986,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
@@ -2009,10 +2009,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 05f8739e7cb890..e4bb3c555a4670 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3523,7 +3523,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v37, s17
; SDAG-NEXT: v_mov_b32_e32 v38, s18
; SDAG-NEXT: v_mov_b32_e32 v39, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v31, v13
; SDAG-NEXT: v_mov_b32_e32 v30, v12
; SDAG-NEXT: v_mov_b32_e32 v29, v11
@@ -3538,8 +3537,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
@@ -3548,6 +3547,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v5, s25
; SDAG-NEXT: v_mov_b32_e32 v6, s26
; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
@@ -3605,19 +3605,19 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: v_mov_b32_e32 v26, v8
; GISEL-NEXT: v_mov_b32_e32 v27, v9
; GISEL-NEXT: v_mov_b32_e32 v28, v10
; GISEL-NEXT: v_mov_b32_e32 v29, v11
; GISEL-NEXT: v_mov_b32_e32 v30, v12
; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: v_mov_b32_e32 v17, s29
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
@@ -3871,7 +3871,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
@@ -3880,6 +3879,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
@@ -3926,10 +3926,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
@@ -4075,13 +4075,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v37, s17
; SDAG-NEXT: v_mov_b32_e32 v38, s18
; SDAG-NEXT: v_mov_b32_e32 v39, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v31, v13
; SDAG-NEXT: v_mov_b32_e32 v30, v12
; SDAG-NEXT: v_mov_b32_e32 v29, v11
; SDAG-NEXT: v_mov_b32_e32 v28, v10
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
@@ -4141,13 +4141,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v16, s20
; GISEL-NEXT: v_mov_b32_e32 v26, v8
; GISEL-NEXT: v_mov_b32_e32 v27, v9
; GISEL-NEXT: v_mov_b32_e32 v28, v10
; GISEL-NEXT: v_mov_b32_e32 v29, v11
; GISEL-NEXT: v_mov_b32_e32 v30, v12
; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v16, s20
; GISEL-NEXT: v_mov_b32_e32 v17, s21
; GISEL-NEXT: v_mov_b32_e32 v18, s22
; GISEL-NEXT: v_mov_b32_e32 v19, s23
@@ -4495,10 +4495,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
@@ -4547,7 +4547,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -4556,6 +4555,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
@@ -4595,10 +4595,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
@@ -4652,9 +4652,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4725,10 +4725,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4748,14 +4748,14 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -4874,10 +4874,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4899,10 +4899,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -5021,10 +5021,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
@@ -5046,10 +5046,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -5168,10 +5168,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -5193,10 +5193,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
index dbe95a8091932a..e36b2181bf5c08 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
@@ -124,8 +124,8 @@ define amdgpu_ps void @test_call(ptr addrspace(1) inreg %ptr) {
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-SDAG-NEXT: s_mov_b32 s6, src_pops_exiting_wave_id
-; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], 36
+; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
; GFX9-SDAG-NEXT: s_mov_b32 s32, 0
@@ -155,31 +155,51 @@ define amdgpu_ps void @test_call(ptr addrspace(1) inreg %ptr) {
; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX10-LABEL: test_call:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX10-NEXT: s_mov_b32 s38, -1
-; GFX10-NEXT: s_mov_b32 s39, 0x31c16000
-; GFX10-NEXT: s_add_u32 s36, s36, s2
-; GFX10-NEXT: s_addc_u32 s37, s37, 0
-; GFX10-NEXT: s_getpc_b64 s[0:1]
-; GFX10-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
-; GFX10-NEXT: s_mov_b64 s[8:9], 36
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: s_endpgm
+; GFX10-SDAG-LABEL: test_call:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX10-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX10-SDAG-NEXT: s_mov_b32 s38, -1
+; GFX10-SDAG-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX10-SDAG-NEXT: s_add_u32 s36, s36, s2
+; GFX10-SDAG-NEXT: s_addc_u32 s37, s37, 0
+; GFX10-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX10-SDAG-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX10-SDAG-NEXT: s_mov_b64 s[8:9], 36
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX10-SDAG-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id
+; GFX10-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX10-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: test_call:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX10-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX10-GISEL-NEXT: s_mov_b32 s38, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX10-GISEL-NEXT: s_add_u32 s36, s36, s2
+; GFX10-GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GFX10-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-GISEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX10-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX10-GISEL-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id
+; GFX10-GISEL-NEXT: s_mov_b64 s[8:9], 36
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX10-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-GISEL-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.pops.exiting.wave.id()
call void @foo(i32 %id)
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 63b139bb25e775..5907dcbff8663a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -185,9 +185,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s6, s6, s6
-; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-GISEL-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index edb6ebcee13255..39f16bf84171d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -291,8 +291,8 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; CHECK-GISEL-NEXT: s_nop 3
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -339,12 +339,12 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; CHECK-GISEL-NEXT: s_nop 3
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -393,12 +393,12 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; CHECK-GISEL-NEXT: s_nop 3
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -491,9 +491,9 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -507,9 +507,9 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -529,9 +529,9 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -545,9 +545,9 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f72f1e52d135fe..74ce23aca60d11 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -295,9 +295,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 4551c60770bdf5..b62a604479b282 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -296,9 +296,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index 46359f7e990599..37ba7661dded8d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -12,42 +12,39 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40
; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
+; GCN-NEXT: ds_load_b128 v[0:3], v32
; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16
; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064
-; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160
-; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304
-; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496
-; GCN-NEXT: ds_load_b128 v[0:3], v32
; GCN-NEXT: ds_load_b128 v[8:11], v32 offset:2048
+; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160
; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144
+; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304
; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288
+; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496
; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(4)
-; GCN-NEXT: v_mov_b32_e32 v47, v7
-; GCN-NEXT: s_waitcnt lgkmcnt(3)
-; GCN-NEXT: v_mov_b32_e32 v55, v15
-; GCN-NEXT: s_waitcnt lgkmcnt(2)
-; GCN-NEXT: v_mov_b32_e32 v63, v23
-; GCN-NEXT: s_waitcnt lgkmcnt(1)
-; GCN-NEXT: v_mov_b32_e32 v71, v31
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_mov_b32_e32 v43, v3
+; GCN-NEXT: v_dual_mov_b32 v47, v7 :: v_dual_mov_b32 v46, v6
; GCN-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
-; GCN-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
-; GCN-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
+; GCN-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(6)
+; GCN-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15
; GCN-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; GCN-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; GCN-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
-; GCN-NEXT: v_mov_b32_e32 v48, v8
+; GCN-NEXT: s_waitcnt lgkmcnt(4)
+; GCN-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23
; GCN-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; GCN-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; GCN-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
-; GCN-NEXT: v_mov_b32_e32 v56, v16
+; GCN-NEXT: s_waitcnt lgkmcnt(2)
+; GCN-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31
; GCN-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; GCN-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; GCN-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
-; GCN-NEXT: v_mov_b32_e32 v64, v24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39
; GCN-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; GCN-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; GCN-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
@@ -80,42 +77,39 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40
+; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16
; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064
-; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160
-; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304
-; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v32 offset:2048
+; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160
; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144
+; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304
; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288
+; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496
; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4)
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v47, v7
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3)
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v55, v15
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2)
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v63, v23
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v71, v31
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v3
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v7 :: v_dual_mov_b32 v46, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(6)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, v8
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v56, v16
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v64, v24
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33
@@ -184,14 +178,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16
; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT: ds_load_b128 v[0:3], v17
+; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v11, v3
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GCN-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1
+; GCN-NEXT: v_mov_b32_e32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
@@ -266,14 +261,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17
+; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v11, v3
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
index dcc3e0df0c7443..703661e22b495d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -24,24 +24,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x6
-; GCN-NEXT: v_mov_b32_e32 v31, v11
+; GCN-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10
+; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT: s_wait_dscnt 0x5
-; GCN-NEXT: v_mov_b32_e32 v35, v15
+; GCN-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14
+; GCN-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12
; GCN-NEXT: s_wait_dscnt 0x4
-; GCN-NEXT: v_mov_b32_e32 v39, v19
+; GCN-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18
+; GCN-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16
; GCN-NEXT: s_wait_dscnt 0x3
-; GCN-NEXT: v_mov_b32_e32 v43, v23
+; GCN-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22
+; GCN-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20
; GCN-NEXT: s_wait_dscnt 0x2
-; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
-; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
-; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
-; GCN-NEXT: v_mov_b32_e32 v32, v12
-; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
-; GCN-NEXT: v_mov_b32_e32 v36, v16
-; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
-; GCN-NEXT: v_mov_b32_e32 v40, v20
-; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
-; GCN-NEXT: v_mov_b32_e32 v44, v24
+; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26
+; GCN-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
@@ -76,24 +72,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index 9a001e0b803941..ad6eaa73ffda90 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -49,30 +49,50 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_get_tma:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_get_tma:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_get_tma:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
store i64 %ret, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_get_realtime:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_get_realtime:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_get_realtime:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -102,15 +122,25 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_get_tba:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_get_tba:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_get_tba:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -140,15 +170,25 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
}
define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
-; GFX11-LABEL: test_get_99999_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: test_get_99999_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_get_99999_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -156,3 +196,5 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
declare i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32)
declare i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 66c02a9bd0c6a5..b38df2b5820949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -200,10 +200,10 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -518,7 +518,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -529,12 +528,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -543,6 +542,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -585,8 +585,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -597,16 +595,18 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
@@ -946,7 +946,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-NEXT: v_mov_b32_e32 v29, s1
; GCN-NEXT: v_mov_b32_e32 v30, s2
; GCN-NEXT: v_mov_b32_e32 v31, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s24
; GCN-NEXT: v_mov_b32_e32 v27, v9
; GCN-NEXT: v_mov_b32_e32 v26, v8
; GCN-NEXT: v_mov_b32_e32 v25, v7
@@ -957,12 +956,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-NEXT: v_mov_b32_e32 v20, v2
; GCN-NEXT: v_mov_b32_e32 v19, v1
; GCN-NEXT: v_mov_b32_e32 v18, v0
+; GCN-NEXT: v_mov_b32_e32 v12, s24
; GCN-NEXT: v_mov_b32_e32 v13, s25
; GCN-NEXT: v_mov_b32_e32 v14, s26
; GCN-NEXT: v_mov_b32_e32 v15, s27
; GCN-NEXT: v_mov_b32_e32 v16, s28
; GCN-NEXT: v_mov_b32_e32 v17, s29
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v2, s18
@@ -971,6 +970,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: v_mov_b32_e32 v6, s22
; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
; GCN-NEXT: v_accvgpr_write_b32 a1, v13
; GCN-NEXT: v_accvgpr_write_b32 a2, v14
; GCN-NEXT: v_accvgpr_write_b32 a3, v15
@@ -1213,10 +1213,10 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -1288,8 +1288,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
@@ -1537,7 +1537,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -1548,12 +1547,12 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -1562,6 +1561,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -1604,8 +1604,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -1616,16 +1614,18 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
@@ -1842,10 +1842,10 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -2057,10 +2057,10 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -2272,10 +2272,10 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -2487,10 +2487,10 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
@@ -2562,8 +2562,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
@@ -2811,7 +2811,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -2822,12 +2821,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -2836,6 +2835,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -2878,8 +2878,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -2890,16 +2888,18 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
@@ -2976,8 +2976,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
@@ -3225,7 +3225,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -3236,12 +3235,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -3250,6 +3249,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -3292,8 +3292,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -3304,16 +3302,18 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
@@ -3390,8 +3390,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
@@ -3639,7 +3639,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -3650,12 +3649,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -3664,6 +3663,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -3706,8 +3706,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -3718,16 +3716,18 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
@@ -3804,8 +3804,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
@@ -4053,7 +4053,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v29, s1
; SDAG-NEXT: v_mov_b32_e32 v30, s2
; SDAG-NEXT: v_mov_b32_e32 v31, s3
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v27, v9
; SDAG-NEXT: v_mov_b32_e32 v26, v8
; SDAG-NEXT: v_mov_b32_e32 v25, v7
@@ -4064,12 +4063,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v20, v2
; SDAG-NEXT: v_mov_b32_e32 v19, v1
; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
@@ -4078,6 +4077,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-NEXT: v_mov_b32_e32 v5, s21
; SDAG-NEXT: v_mov_b32_e32 v6, s22
; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
@@ -4120,8 +4120,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, s24
-; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v24, v0
; GISEL-NEXT: v_mov_b32_e32 v25, v1
; GISEL-NEXT: v_mov_b32_e32 v26, v2
@@ -4132,16 +4130,18 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v31, v7
; GISEL-NEXT: v_mov_b32_e32 v32, v8
; GISEL-NEXT: v_mov_b32_e32 v33, v9
-; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
; GISEL-NEXT: v_mov_b32_e32 v20, s26
; GISEL-NEXT: v_mov_b32_e32 v21, s27
; GISEL-NEXT: v_mov_b32_e32 v22, s28
; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 17ebb1a835462d..d5781f2772d6a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -205,9 +205,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s8, s8, s8
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s10
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s10
; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
; GFX12-PACKED-GISEL-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 7342c366799e9c..1d827b0cc1a2b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -617,14 +617,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1]
; GFX802-SDAG-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: s_nop 3
; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
;
@@ -677,14 +677,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1]
; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT: s_nop 2
+; GFX802-GISEL-NEXT: s_nop 3
; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
;
@@ -746,13 +746,13 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT: s_nop 2
+; GFX802-SDAG-NEXT: s_nop 3
; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s2
; GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
@@ -886,14 +886,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX802-SDAG-NEXT: v_readfirstlane_b32 m0, v2
; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT: s_nop 1
+; GFX802-SDAG-NEXT: s_nop 2
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2
@@ -1415,8 +1415,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
@@ -1452,8 +1452,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
@@ -1542,9 +1542,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX802-GISEL-NEXT: s_endpgm
@@ -1639,9 +1639,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX802-GISEL-NEXT: s_endpgm
@@ -1687,8 +1687,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
@@ -1719,8 +1719,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
@@ -1801,9 +1801,9 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX802-GISEL-NEXT: s_endpgm
@@ -1892,9 +1892,9 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX802-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index bbade6e7469f7e..b7b4fb32d3c5b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -382,8 +382,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
@@ -434,8 +434,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -917,8 +917,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
@@ -989,8 +989,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
@@ -1675,8 +1675,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
@@ -1766,8 +1766,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 81bb556b8c87bc..db3cb1b3acf503 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -384,8 +384,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
@@ -436,8 +436,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
@@ -919,8 +919,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
@@ -991,8 +991,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
@@ -1677,8 +1677,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
@@ -1768,8 +1768,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 333d428c84bcca..b2e8f0563c532f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -61,9 +61,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_ldexp_f32 v2, v0, s2
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
@@ -241,8 +241,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3
; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
@@ -466,18 +466,18 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3
; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0
; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
@@ -766,17 +766,17 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4
; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index 480d978fa530b4..63fd937347190e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
; GFX8CHECK-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX8CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80
; GFX8CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 1d869559d9e772..f39c882bba7974 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -47,8 +47,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index d411601d9eabd2..744337763e5caa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -41,8 +41,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 8b3b79b0b1bdd7..9a92b686cdbb03 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -274,17 +274,17 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
@@ -561,21 +561,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
; VI-SDAG-NEXT: v_ldexp_f32 v5, s1, v5
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v6, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v5, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
@@ -939,25 +939,25 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5
; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
; VI-SDAG-NEXT: v_log_f32_e32 v7, v7
; VI-SDAG-NEXT: v_log_f32_e32 v8, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v8, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index e354ec6fb3dd78..a7af30d9ff73d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -820,8 +820,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: v_mov_b32_e32 v5, s21
; GFX7-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
@@ -841,8 +841,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s23
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
@@ -862,8 +862,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s22
-; GFX900-NEXT: v_mov_b32_e32 v4, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s23
+; GFX900-NEXT: v_mov_b32_e32 v4, s20
; GFX900-NEXT: v_mov_b32_e32 v5, s21
; GFX900-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 71fdd691a15122..30c6b90abb1e01 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -820,8 +820,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: v_mov_b32_e32 v5, s21
; GFX7-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
@@ -841,8 +841,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s23
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
@@ -862,8 +862,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s22
-; GFX900-NEXT: v_mov_b32_e32 v4, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s23
+; GFX900-NEXT: v_mov_b32_e32 v4, s20
; GFX900-NEXT: v_mov_b32_e32 v5, s21
; GFX900-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index b378d69fb842ff..a88b3ea7cea284 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 {
; SI-NEXT: s_cselect_b32 s4, 0, s4
; SI-NEXT: s_cselect_b32 s5, s8, s5
; SI-NEXT: s_cmp_gt_i32 s7, 51
-; SI-NEXT: s_cselect_b32 s8, s2, s4
; SI-NEXT: s_cselect_b32 s9, s3, s5
+; SI-NEXT: s_cselect_b32 s8, s2, s4
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1]
@@ -155,8 +155,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in)
; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: s_cselect_b32 s1, s3, s1
; SI-NEXT: s_cmp_gt_i32 s12, 51
-; SI-NEXT: s_cselect_b32 s12, s10, s0
; SI-NEXT: s_cselect_b32 s13, s11, s1
+; SI-NEXT: s_cselect_b32 s12, s10, s0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1]
@@ -175,8 +175,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in)
; SI-NEXT: s_cselect_b32 s4, 0, s4
; SI-NEXT: s_cselect_b32 s5, s6, s5
; SI-NEXT: s_cmp_gt_i32 s3, 51
-; SI-NEXT: s_cselect_b32 s4, s8, s4
; SI-NEXT: s_cselect_b32 s5, s9, s5
+; SI-NEXT: s_cselect_b32 s4, s8, s4
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3]
@@ -248,8 +248,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in)
; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: s_cselect_b32 s1, s3, s1
; SI-NEXT: s_cmp_gt_i32 s16, 51
-; SI-NEXT: s_cselect_b32 s16, s10, s0
; SI-NEXT: s_cselect_b32 s17, s11, s1
+; SI-NEXT: s_cselect_b32 s16, s10, s0
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1]
@@ -269,9 +269,9 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in)
; SI-NEXT: s_cselect_b32 s5, s10, s5
; SI-NEXT: s_cmp_gt_i32 s3, 51
; SI-NEXT: s_brev_b32 s18, -2
+; SI-NEXT: s_cselect_b32 s5, s9, s5
; SI-NEXT: s_cselect_b32 s4, s8, s4
; SI-NEXT: v_bfi_b32 v5, s18, v0, v1
-; SI-NEXT: s_cselect_b32 s5, s9, s5
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1]
@@ -290,8 +290,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in)
; SI-NEXT: s_cselect_b32 s8, 0, s8
; SI-NEXT: s_cselect_b32 s9, s10, s9
; SI-NEXT: s_cmp_gt_i32 s3, 51
-; SI-NEXT: s_cselect_b32 s8, s14, s8
; SI-NEXT: s_cselect_b32 s9, s15, s9
+; SI-NEXT: s_cselect_b32 s8, s14, s8
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1]
@@ -402,8 +402,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: s_cselect_b32 s1, s3, s1
; SI-NEXT: s_cmp_gt_i32 s24, 51
-; SI-NEXT: s_cselect_b32 s24, s10, s0
; SI-NEXT: s_cselect_b32 s25, s11, s1
+; SI-NEXT: s_cselect_b32 s24, s10, s0
; SI-NEXT: v_mov_b32_e32 v0, s24
; SI-NEXT: v_mov_b32_e32 v1, s25
; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1]
@@ -423,9 +423,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; SI-NEXT: s_cselect_b32 s5, s11, s5
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: s_brev_b32 s3, -2
+; SI-NEXT: s_cselect_b32 s5, s9, s5
; SI-NEXT: s_cselect_b32 s4, s8, s4
; SI-NEXT: v_bfi_b32 v9, s3, v0, v1
-; SI-NEXT: s_cselect_b32 s5, s9, s5
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1]
@@ -444,8 +444,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; SI-NEXT: s_cselect_b32 s8, 0, s8
; SI-NEXT: s_cselect_b32 s9, s11, s9
; SI-NEXT: s_cmp_gt_i32 s10, 51
-; SI-NEXT: s_cselect_b32 s8, s14, s8
; SI-NEXT: s_cselect_b32 s9, s15, s9
+; SI-NEXT: s_cselect_b32 s8, s14, s8
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1]
@@ -464,8 +464,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; SI-NEXT: s_cselect_b32 s4, 0, s4
; SI-NEXT: s_cselect_b32 s5, s11, s5
; SI-NEXT: s_cmp_gt_i32 s10, 51
-; SI-NEXT: s_cselect_b32 s4, s12, s4
; SI-NEXT: s_cselect_b32 s5, s13, s5
+; SI-NEXT: s_cselect_b32 s4, s12, s4
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5]
@@ -485,8 +485,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
; SI-NEXT: s_cselect_b32 s8, 0, s8
; SI-NEXT: s_cselect_b32 s9, s11, s9
; SI-NEXT: s_cmp_gt_i32 s10, 51
-; SI-NEXT: s_cselect_b32 s8, s18, s8
; SI-NEXT: s_cselect_b32 s9, s19, s9
+; SI-NEXT: s_cselect_b32 s8, s18, s8
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 6f95364ac36447..17d5f442374812 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -50,9 +50,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%ld = load double, ptr addrspace(4) %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 1fc7349882ba13..1a7552f5e314d0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -536,9 +536,9 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
@@ -1329,13 +1329,13 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10002
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10004
; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 7, v0
; GFX8-NEXT: v_bfe_u32 v2, v0, 6, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s4
@@ -1446,8 +1446,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v3, v4, 3, 1
@@ -1600,27 +1600,27 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v15, s3
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: v_mov_b32_e32 v4, s17
; GFX8-NEXT: v_mov_b32_e32 v7, s12
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v11, s5
; GFX8-NEXT: v_mov_b32_e32 v8, s16
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: v_mov_b32_e32 v10, s15
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v12, s9
; GFX8-NEXT: v_mov_b32_e32 v13, s4
@@ -1778,13 +1778,13 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_mov_b32_e32 v12, s0
; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v15, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2934,93 +2934,93 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v0, s27
; GFX8-NEXT: s_addc_u32 s27, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s26
-; GFX8-NEXT: v_mov_b32_e32 v1, s44
-; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NEXT: s_add_u32 s26, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s44
+; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v0, s66
; GFX8-NEXT: v_mov_b32_e32 v1, s42
; GFX8-NEXT: v_mov_b32_e32 v2, s65
; GFX8-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xd0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v0, s64
; GFX8-NEXT: v_mov_b32_e32 v1, s63
; GFX8-NEXT: v_mov_b32_e32 v2, s62
; GFX8-NEXT: v_mov_b32_e32 v3, s40
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xc0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v0, s61
; GFX8-NEXT: v_mov_b32_e32 v1, s39
; GFX8-NEXT: v_mov_b32_e32 v2, s60
; GFX8-NEXT: v_mov_b32_e32 v3, s38
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xb0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v0, s59
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NEXT: v_mov_b32_e32 v3, s36
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xa0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x90
; GFX8-NEXT: v_mov_b32_e32 v0, s57
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_mov_b32_e32 v2, s56
; GFX8-NEXT: v_mov_b32_e32 v3, s34
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x90
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x80
; GFX8-NEXT: v_mov_b32_e32 v0, s55
; GFX8-NEXT: v_mov_b32_e32 v1, s33
; GFX8-NEXT: v_mov_b32_e32 v2, s54
; GFX8-NEXT: v_mov_b32_e32 v3, s31
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x80
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x70
; GFX8-NEXT: v_mov_b32_e32 v0, s52
; GFX8-NEXT: v_mov_b32_e32 v1, s30
; GFX8-NEXT: v_mov_b32_e32 v2, s53
; GFX8-NEXT: v_mov_b32_e32 v3, s29
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x70
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x60
; GFX8-NEXT: v_mov_b32_e32 v0, s51
; GFX8-NEXT: v_mov_b32_e32 v1, s28
; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: v_mov_b32_e32 v3, s25
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x50
; GFX8-NEXT: v_mov_b32_e32 v0, s49
; GFX8-NEXT: v_mov_b32_e32 v1, s23
; GFX8-NEXT: v_mov_b32_e32 v2, s48
; GFX8-NEXT: v_mov_b32_e32 v3, s21
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x50
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v0, s47
; GFX8-NEXT: v_mov_b32_e32 v1, s46
@@ -3032,30 +3032,30 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NEXT: s_add_u32 s22, s0, 64
; GFX8-NEXT: s_addc_u32 s23, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: s_add_u32 s18, s0, 48
; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v5, s19
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: s_add_u32 s14, s0, 32
; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -3063,9 +3063,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s4, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -3635,84 +3635,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v0, s27
; GFX8-NEXT: s_addc_u32 s27, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s26
-; GFX8-NEXT: v_mov_b32_e32 v1, s66
-; GFX8-NEXT: v_mov_b32_e32 v2, s65
; GFX8-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NEXT: s_add_u32 s26, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s66
+; GFX8-NEXT: v_mov_b32_e32 v2, s65
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v0, s64
; GFX8-NEXT: v_mov_b32_e32 v1, s63
; GFX8-NEXT: v_mov_b32_e32 v2, s62
; GFX8-NEXT: v_mov_b32_e32 v3, s61
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xd0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v0, s60
; GFX8-NEXT: v_mov_b32_e32 v1, s59
; GFX8-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NEXT: v_mov_b32_e32 v3, s57
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xc0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v0, s56
; GFX8-NEXT: v_mov_b32_e32 v1, s55
; GFX8-NEXT: v_mov_b32_e32 v2, s54
; GFX8-NEXT: v_mov_b32_e32 v3, s53
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xb0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v0, s52
; GFX8-NEXT: v_mov_b32_e32 v1, s51
; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: v_mov_b32_e32 v3, s49
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0xa0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x90
; GFX8-NEXT: v_mov_b32_e32 v0, s48
; GFX8-NEXT: v_mov_b32_e32 v1, s47
; GFX8-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NEXT: v_mov_b32_e32 v3, s45
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x90
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x80
; GFX8-NEXT: v_mov_b32_e32 v0, s44
; GFX8-NEXT: v_mov_b32_e32 v1, s43
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x80
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x70
; GFX8-NEXT: v_mov_b32_e32 v0, s40
; GFX8-NEXT: v_mov_b32_e32 v1, s39
; GFX8-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NEXT: v_mov_b32_e32 v3, s37
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x70
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
+; GFX8-NEXT: v_mov_b32_e32 v5, s27
+; GFX8-NEXT: s_add_u32 s26, s0, 0x60
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s33
-; GFX8-NEXT: v_mov_b32_e32 v5, s27
-; GFX8-NEXT: s_add_u32 s26, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s27, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v0, s31
; GFX8-NEXT: v_mov_b32_e32 v1, s30
@@ -3725,9 +3725,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s22, s0, 0x50
; GFX8-NEXT: v_mov_b32_e32 v2, s23
; GFX8-NEXT: s_addc_u32 s23, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v0, s25
; GFX8-NEXT: v_mov_b32_e32 v1, s24
+; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -3735,9 +3735,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s18, s0, 64
; GFX8-NEXT: v_mov_b32_e32 v2, s19
; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v1, s20
+; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v5, s19
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -3745,9 +3745,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s14, s0, 48
; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v0, s17
; GFX8-NEXT: v_mov_b32_e32 v1, s16
+; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -3755,9 +3755,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s10, s0, 32
; GFX8-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: v_mov_b32_e32 v0, s13
; GFX8-NEXT: v_mov_b32_e32 v1, s12
+; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -3765,9 +3765,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s6, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -4284,8 +4284,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
@@ -4423,8 +4424,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
@@ -4637,9 +4639,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_mov_b32_e32 v8, s2
; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v4
@@ -4739,8 +4741,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0
@@ -4848,14 +4850,14 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v11, s3
; GFX8-NEXT: v_mov_b32_e32 v10, s2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
@@ -4968,8 +4970,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX8-NEXT: s_add_u32 s2, s0, 16
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0
@@ -5098,11 +5100,11 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: s_add_u32 s2, s0, 48
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s5
+; GFX8-NEXT: s_addc_u32 s5, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v16, s5
; GFX8-NEXT: v_mov_b32_e32 v15, s4
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
@@ -5113,7 +5115,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1
; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1
; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
-; GFX8-NEXT: v_mov_b32_e32 v16, s3
+; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: s_add_u32 s0, s0, 16
@@ -5121,8 +5123,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v18, s1
; GFX8-NEXT: v_mov_b32_e32 v17, s0
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1
; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1
+; GFX8-NEXT: v_mov_b32_e32 v16, s3
; GFX8-NEXT: v_mov_b32_e32 v15, s2
; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v0
@@ -5292,25 +5294,25 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v8, s6
; GFX8-NEXT: v_mov_b32_e32 v9, s7
; GFX8-NEXT: v_mov_b32_e32 v10, s8
; GFX8-NEXT: v_mov_b32_e32 v11, s9
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v12, s10
; GFX8-NEXT: v_mov_b32_e32 v13, s11
; GFX8-NEXT: v_mov_b32_e32 v14, s12
; GFX8-NEXT: v_mov_b32_e32 v15, s13
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s14
@@ -5869,26 +5871,26 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v7, s19
; GFX8-NEXT: v_mov_b32_e32 v8, s20
; GFX8-NEXT: v_mov_b32_e32 v9, s21
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v10, s22
; GFX8-NEXT: v_mov_b32_e32 v11, s23
; GFX8-NEXT: v_mov_b32_e32 v12, s24
; GFX8-NEXT: v_mov_b32_e32 v13, s25
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9]
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v14, s26
; GFX8-NEXT: v_mov_b32_e32 v15, s27
; GFX8-NEXT: v_mov_b32_e32 v16, s28
; GFX8-NEXT: v_mov_b32_e32 v17, s29
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s30
@@ -6227,92 +6229,92 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s34
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0xd0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s33
-; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0xc0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s33
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0xb0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s31
+; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s30
-; GFX8-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0xa0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s28
-; GFX8-NEXT: v_mov_b32_e32 v2, s29
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s28
+; GFX8-NEXT: v_mov_b32_e32 v2, s29
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s27
-; GFX8-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0x80
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s27
+; GFX8-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s26
-; GFX8-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0x70
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s25
-; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s25
+; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s24
-; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 0x50
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s23
-; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 64
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s23
+; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 48
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_add_u32 s6, s0, 32
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v2, s19
@@ -6841,48 +6843,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s45
; GFX8-NEXT: s_addc_u32 s45, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s44
-; GFX8-NEXT: v_mov_b32_e32 v2, s46
-; GFX8-NEXT: v_mov_b32_e32 v3, s47
; GFX8-NEXT: v_mov_b32_e32 v5, s45
; GFX8-NEXT: s_add_u32 s44, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s46
+; GFX8-NEXT: v_mov_b32_e32 v3, s47
; GFX8-NEXT: s_addc_u32 s45, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NEXT: s_add_u32 s44, s0, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v0, s48
; GFX8-NEXT: v_mov_b32_e32 v1, s49
; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: v_mov_b32_e32 v3, s51
-; GFX8-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NEXT: s_add_u32 s44, s0, 0xd0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s45, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NEXT: s_add_u32 s44, s0, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v0, s52
; GFX8-NEXT: v_mov_b32_e32 v1, s53
; GFX8-NEXT: v_mov_b32_e32 v2, s54
; GFX8-NEXT: v_mov_b32_e32 v3, s55
-; GFX8-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NEXT: s_add_u32 s44, s0, 0xc0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s45, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NEXT: s_add_u32 s44, s0, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v0, s56
; GFX8-NEXT: v_mov_b32_e32 v1, s57
; GFX8-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NEXT: v_mov_b32_e32 v3, s59
-; GFX8-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NEXT: s_add_u32 s44, s0, 0xb0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s45, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s44
+; GFX8-NEXT: v_mov_b32_e32 v5, s45
+; GFX8-NEXT: s_add_u32 s44, s0, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v0, s60
; GFX8-NEXT: v_mov_b32_e32 v1, s61
; GFX8-NEXT: v_mov_b32_e32 v2, s62
; GFX8-NEXT: v_mov_b32_e32 v3, s63
-; GFX8-NEXT: v_mov_b32_e32 v5, s45
-; GFX8-NEXT: s_add_u32 s44, s0, 0xa0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s45, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s44
; GFX8-NEXT: v_mov_b32_e32 v0, s64
; GFX8-NEXT: v_mov_b32_e32 v1, s65
@@ -6895,9 +6897,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s40, s0, 0x90
; GFX8-NEXT: v_mov_b32_e32 v3, s41
; GFX8-NEXT: s_addc_u32 s41, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NEXT: v_mov_b32_e32 v0, s42
; GFX8-NEXT: v_mov_b32_e32 v1, s43
+; GFX8-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NEXT: v_mov_b32_e32 v5, s41
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6905,9 +6907,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s36, s0, 0x80
; GFX8-NEXT: v_mov_b32_e32 v3, s37
; GFX8-NEXT: s_addc_u32 s37, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s36
; GFX8-NEXT: v_mov_b32_e32 v0, s38
; GFX8-NEXT: v_mov_b32_e32 v1, s39
+; GFX8-NEXT: v_mov_b32_e32 v4, s36
; GFX8-NEXT: v_mov_b32_e32 v5, s37
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6915,9 +6917,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s30, s0, 0x70
; GFX8-NEXT: v_mov_b32_e32 v3, s31
; GFX8-NEXT: s_addc_u32 s31, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NEXT: v_mov_b32_e32 v0, s34
; GFX8-NEXT: v_mov_b32_e32 v1, s35
+; GFX8-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NEXT: v_mov_b32_e32 v5, s31
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6925,9 +6927,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s26, s0, 0x60
; GFX8-NEXT: v_mov_b32_e32 v3, s27
; GFX8-NEXT: s_addc_u32 s27, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v0, s28
; GFX8-NEXT: v_mov_b32_e32 v1, s29
+; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6935,9 +6937,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s22, s0, 0x50
; GFX8-NEXT: v_mov_b32_e32 v3, s23
; GFX8-NEXT: s_addc_u32 s23, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NEXT: v_mov_b32_e32 v1, s25
+; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6945,9 +6947,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s18, s0, 64
; GFX8-NEXT: v_mov_b32_e32 v3, s19
; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s21
+; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v5, s19
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6955,9 +6957,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s14, s0, 48
; GFX8-NEXT: v_mov_b32_e32 v3, s15
; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
+; GFX8-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6965,9 +6967,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s10, s0, 32
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -6975,9 +6977,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s6, s0, 16
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -7592,92 +7594,92 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s66
-; GFX8-NEXT: v_mov_b32_e32 v2, s44
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s66
+; GFX8-NEXT: v_mov_b32_e32 v2, s44
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s65
-; GFX8-NEXT: v_mov_b32_e32 v2, s45
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s65
+; GFX8-NEXT: v_mov_b32_e32 v2, s45
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s64
-; GFX8-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s64
+; GFX8-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s63
-; GFX8-NEXT: v_mov_b32_e32 v2, s47
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s63
+; GFX8-NEXT: v_mov_b32_e32 v2, s47
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s62
-; GFX8-NEXT: v_mov_b32_e32 v2, s48
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x190
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s62
+; GFX8-NEXT: v_mov_b32_e32 v2, s48
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s61
-; GFX8-NEXT: v_mov_b32_e32 v2, s49
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x180
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s61
+; GFX8-NEXT: v_mov_b32_e32 v2, s49
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s60
-; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x170
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s60
+; GFX8-NEXT: v_mov_b32_e32 v2, s50
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s59
-; GFX8-NEXT: v_mov_b32_e32 v2, s51
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x160
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s59
+; GFX8-NEXT: v_mov_b32_e32 v2, s51
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s58
-; GFX8-NEXT: v_mov_b32_e32 v2, s52
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x150
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s58
+; GFX8-NEXT: v_mov_b32_e32 v2, s52
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s57
-; GFX8-NEXT: v_mov_b32_e32 v2, s53
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x140
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s57
+; GFX8-NEXT: v_mov_b32_e32 v2, s53
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s56
-; GFX8-NEXT: v_mov_b32_e32 v2, s40
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x130
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s56
+; GFX8-NEXT: v_mov_b32_e32 v2, s40
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NEXT: v_mov_b32_e32 v0, s55
-; GFX8-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x120
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s55
+; GFX8-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NEXT: v_mov_b32_e32 v0, s54
; GFX8-NEXT: v_mov_b32_e32 v2, s37
@@ -7687,56 +7689,56 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s41
; GFX8-NEXT: s_addc_u32 s41, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NEXT: v_mov_b32_e32 v2, s35
+; GFX8-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NEXT: v_mov_b32_e32 v5, s41
; GFX8-NEXT: s_add_u32 s38, s0, 0x100
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s39
; GFX8-NEXT: s_addc_u32 s39, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s38
; GFX8-NEXT: v_mov_b32_e32 v2, s33
+; GFX8-NEXT: v_mov_b32_e32 v4, s38
; GFX8-NEXT: v_mov_b32_e32 v5, s39
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: s_add_u32 s36, s0, 0xf0
; GFX8-NEXT: s_addc_u32 s37, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s36
; GFX8-NEXT: v_mov_b32_e32 v2, s30
+; GFX8-NEXT: v_mov_b32_e32 v4, s36
; GFX8-NEXT: v_mov_b32_e32 v5, s37
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s34
; GFX8-NEXT: s_add_u32 s34, s0, 0xe0
; GFX8-NEXT: s_addc_u32 s35, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v2, s28
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: s_add_u32 s30, s0, 0xd0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s31
; GFX8-NEXT: s_addc_u32 s31, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NEXT: v_mov_b32_e32 v2, s26
+; GFX8-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NEXT: v_mov_b32_e32 v5, s31
; GFX8-NEXT: s_add_u32 s28, s0, 0xc0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s29
; GFX8-NEXT: s_addc_u32 s29, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s28
; GFX8-NEXT: v_mov_b32_e32 v2, s25
+; GFX8-NEXT: v_mov_b32_e32 v4, s28
; GFX8-NEXT: v_mov_b32_e32 v5, s29
; GFX8-NEXT: s_add_u32 s26, s0, 0xb0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s27
; GFX8-NEXT: s_addc_u32 s27, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v2, s22
+; GFX8-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NEXT: s_add_u32 s22, s0, 0xa0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -7744,11 +7746,11 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v0, s23
; GFX8-NEXT: s_addc_u32 s23, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s22
-; GFX8-NEXT: v_mov_b32_e32 v2, s24
; GFX8-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NEXT: s_add_u32 s22, s0, 0x90
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s24
; GFX8-NEXT: s_addc_u32 s23, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, s19
@@ -7758,32 +7760,32 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: s_add_u32 s20, s0, 0x80
; GFX8-NEXT: s_addc_u32 s21, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v2, s17
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: s_add_u32 s18, s0, 0x70
; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v2, s15
+; GFX8-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NEXT: v_mov_b32_e32 v5, s19
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: s_add_u32 s16, s0, 0x60
; GFX8-NEXT: s_addc_u32 s17, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s13
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NEXT: s_add_u32 s12, s0, 0x50
; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -8909,10 +8911,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v18, s0
; GFX8-NEXT: s_add_u32 s0, s8, 0x110
; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mov_b32_e32 v15, s3
; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
+; GFX8-NEXT: v_mov_b32_e32 v15, s3
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mov_b32_e32 v7, s7
@@ -9076,14 +9078,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_readlane_b32 s0, v62, 4
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_readlane_b32 s0, v62, 4
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_readlane_b32 s1, v62, 5
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_mov_b32_e32 v0, s30
; GFX8-NEXT: v_mov_b32_e32 v1, s31
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2afac4e90aa407..075dd28b477df9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -182,10 +182,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-NEXT: s_add_u32 s4, s0, 4
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2
; GCN-HSA-NEXT: flat_store_short v[2:3], v4
@@ -200,10 +200,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2
; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4
@@ -325,9 +325,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -403,10 +403,10 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -451,12 +451,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_endpgm
@@ -475,12 +475,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -631,6 +631,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5]
@@ -639,7 +640,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[12:13]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[14:15]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24
@@ -669,8 +669,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v5, v[6:7]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_load_ushort v8, v[8:9]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11]
@@ -1701,9 +1701,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -1733,9 +1733,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
@@ -1853,16 +1853,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7
; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -1885,16 +1885,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
@@ -2298,27 +2298,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9
; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7
+; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7
-; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
+; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -2326,8 +2328,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
-; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -2356,27 +2356,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
@@ -2384,8 +2386,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
@@ -3099,10 +3099,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12
; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13
@@ -3113,24 +3113,24 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 64
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9
; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
@@ -3138,10 +3138,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7
; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -3149,19 +3149,19 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5
; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 16
+; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
+; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
-; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -3207,18 +3207,18 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x70
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
@@ -3226,10 +3226,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
@@ -3237,10 +3237,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
@@ -3248,10 +3248,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
@@ -3259,19 +3259,19 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
@@ -3750,10 +3750,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
@@ -3768,10 +3768,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -4629,9 +4629,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2
; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11
; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2
; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
@@ -4639,24 +4639,24 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63
; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61
; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12
+; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15]
+; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80
-; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0
; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15
; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12
@@ -4687,8 +4687,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -4700,8 +4700,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18
@@ -4814,18 +4814,18 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
@@ -4833,10 +4833,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
@@ -4844,10 +4844,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
@@ -4855,10 +4855,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
@@ -4866,10 +4866,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
@@ -4877,10 +4877,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
@@ -4897,10 +4897,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
@@ -4908,10 +4908,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s29
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s28
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
@@ -4919,10 +4919,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s27
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s26
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
@@ -4930,10 +4930,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s25
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s24
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
@@ -4941,10 +4941,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s23
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s22
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
@@ -4952,19 +4952,21 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s21
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s20
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39
; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
@@ -4972,8 +4974,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
@@ -5887,8 +5887,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -6023,9 +6023,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -6054,9 +6054,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
@@ -6460,21 +6460,21 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
@@ -7065,36 +7065,36 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_addc_u32 s29, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11
; GCN-HSA-NEXT: s_add_u32 s10, s0, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29
+; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35
@@ -7107,9 +7107,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s8, s0, 64
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -7117,9 +7117,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7170,30 +7170,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
@@ -7560,82 +7560,82 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26
; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
-; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23
-; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 48
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s16, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
@@ -7645,8 +7645,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_add_u32 s2, s16, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -7703,85 +7703,85 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22
; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21
@@ -8294,9 +8294,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49
; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46
; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47
; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0
@@ -8309,29 +8309,28 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43
; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x50
; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
; GCN-HSA-NEXT: s_add_u32 s38, s16, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
+; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
-; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39
; GCN-HSA-NEXT: s_add_u32 s38, s16, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43
; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14
; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0
@@ -8339,6 +8338,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15
; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0
@@ -8347,24 +8347,24 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12
; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13
-; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13
+; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
@@ -8374,9 +8374,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8384,9 +8384,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x80
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8394,9 +8394,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8404,9 +8404,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s4, s16, 64
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_nop 0
@@ -8414,9 +8414,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-HSA-NEXT: s_add_u32 s2, s16, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -8494,9 +8494,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61
; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
@@ -8504,9 +8504,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
@@ -8514,9 +8514,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55
; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
@@ -8524,9 +8524,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
@@ -8534,9 +8534,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49
; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_nop 0
@@ -8545,12 +8545,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39
; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39
; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47
; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45
@@ -8564,12 +8564,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25
; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25
; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41
; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
@@ -8583,12 +8583,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21
; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31
; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 4ce3b46211e64a..2b08f88b5354a6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -159,9 +159,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -346,10 +346,10 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -394,12 +394,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: s_endpgm
@@ -418,12 +418,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -525,15 +525,15 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
@@ -556,15 +556,15 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
@@ -684,18 +684,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32
; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13
+; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7]
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
@@ -716,18 +716,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7]
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
@@ -796,13 +796,13 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v10, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v9, s13
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32
; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
@@ -861,18 +861,19 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0
+; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
; GFX7-HSA-NEXT: s_endpgm
@@ -894,18 +895,19 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1029,27 +1031,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32
; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX7-HSA-NEXT: s_nop 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -1063,27 +1066,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1147,14 +1151,14 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v12, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
-; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14
-; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4
-; GFX12-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6
-; GFX12-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
-; GFX12-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s1
+; GFX12-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s3
+; GFX12-NEXT: v_mov_b32_e32 v10, s2
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32
; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
@@ -1210,27 +1214,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s19
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -1252,27 +1256,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s19
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
@@ -2093,16 +2097,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s2
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: s_endpgm
@@ -2121,16 +2125,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -2467,9 +2471,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: s_add_u32 s6, s8, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -2477,9 +2481,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: s_add_u32 s4, s8, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -2487,9 +2491,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: s_add_u32 s2, s8, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
@@ -2519,9 +2523,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2529,9 +2533,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2539,9 +2543,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
@@ -2784,19 +2788,19 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33
; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6]
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18
@@ -2804,9 +2808,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19
@@ -2814,23 +2818,23 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14
; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9]
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
@@ -2866,9 +2870,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0x70
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2876,9 +2880,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2886,9 +2890,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2896,9 +2900,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2906,9 +2910,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2916,9 +2920,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -2926,9 +2930,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
@@ -3699,8 +3703,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35
; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34
; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26]
; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35
; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0
@@ -3708,16 +3713,15 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35
; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26]
; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35
; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90
; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28
; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7]
@@ -3732,14 +3736,14 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38
; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25
-; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11]
; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28
; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11]
; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29
; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23
; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22
@@ -3751,7 +3755,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23]
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49
@@ -3759,16 +3762,17 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47
; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6]
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9]
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9
@@ -3778,9 +3782,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -3788,9 +3792,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -3798,9 +3802,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -3855,9 +3859,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s30, s36, 0xf0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31
; GFX8-NOHSA-NEXT: s_addc_u32 s31, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s65
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3865,9 +3869,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s28, s36, 0xe0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29
; GFX8-NOHSA-NEXT: s_addc_u32 s29, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s29
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3875,9 +3879,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s26, s36, 0xd0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27
; GFX8-NOHSA-NEXT: s_addc_u32 s27, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3885,9 +3889,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s24, s36, 0xc0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3895,9 +3899,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s22, s36, 0xb0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23
; GFX8-NOHSA-NEXT: s_addc_u32 s23, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s58
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3905,9 +3909,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0xa0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21
; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3915,9 +3919,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s18, s36, 0x90
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
; GFX8-NOHSA-NEXT: s_addc_u32 s19, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s19
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3925,9 +3929,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 0x80
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3935,9 +3939,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s14, s36, 0x70
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NOHSA-NEXT: s_addc_u32 s15, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3945,9 +3949,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3955,9 +3959,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s10, s36, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3965,9 +3969,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3975,9 +3979,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s6, s36, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3985,9 +3989,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -3995,9 +3999,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX8-NOHSA-NEXT: s_add_u32 s2, s36, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36
@@ -4496,22 +4500,22 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xa0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
-; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
+; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0
@@ -5123,18 +5127,19 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-HSA-NEXT: s_nop 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19
; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
@@ -5143,9 +5148,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -5153,9 +5158,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -5163,9 +5168,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
@@ -5197,18 +5202,18 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25
; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s27
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21
; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -5216,9 +5221,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -5226,9 +5231,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -5236,9 +5241,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -5246,9 +5251,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index b3e75e767ae641..12d9b129e8697c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -66,9 +66,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%ld = load i64, ptr addrspace(4) %in
@@ -143,10 +143,10 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -189,12 +189,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, s8
; GFX7-NEXT: v_mov_b32_e32 v6, s9
-; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
-; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_endpgm
@@ -212,12 +212,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: v_mov_b32_e32 v6, s9
-; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -305,12 +305,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-NEXT: v_mov_b32_e32 v4, s0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-NEXT: v_mov_b32_e32 v6, s2
; GFX7-NEXT: v_mov_b32_e32 v7, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-NEXT: s_endpgm
@@ -329,12 +329,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
@@ -429,27 +429,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-NEXT: v_mov_b32_e32 v6, s18
; GFX7-NEXT: v_mov_b32_e32 v7, s19
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: s_add_u32 s8, s16, 32
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: v_mov_b32_e32 v2, s14
; GFX7-NEXT: v_mov_b32_e32 v3, s15
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-NEXT: s_add_u32 s8, s16, 32
; GFX7-NEXT: v_mov_b32_e32 v5, s9
-; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: v_mov_b32_e32 v7, s11
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s4, s16, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: v_mov_b32_e32 v4, s16
@@ -471,27 +471,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
; GFX8-NEXT: v_mov_b32_e32 v6, s18
; GFX8-NEXT: v_mov_b32_e32 v7, s19
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: s_add_u32 s8, s16, 32
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_add_u32 s8, s16, 32
; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NEXT: s_addc_u32 s9, s17, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: v_mov_b32_e32 v7, s11
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s4, s16, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s5, s17, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s16
@@ -664,18 +664,19 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: v_mov_b32_e32 v16, s18
; GFX7-NEXT: v_mov_b32_e32 v17, s19
; GFX7-NEXT: s_add_u32 s18, s16, 0x60
-; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
; GFX7-NEXT: s_add_u32 s18, s16, 0x50
-; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
; GFX7-NEXT: s_add_u32 s18, s16, 64
-; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX7-NEXT: s_addc_u32 s19, s17, 0
+; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
@@ -684,9 +685,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_add_u32 s12, s16, 48
; GFX7-NEXT: v_mov_b32_e32 v1, s13
; GFX7-NEXT: s_addc_u32 s13, s17, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s12
; GFX7-NEXT: v_mov_b32_e32 v2, s14
; GFX7-NEXT: v_mov_b32_e32 v3, s15
+; GFX7-NEXT: v_mov_b32_e32 v4, s12
; GFX7-NEXT: v_mov_b32_e32 v5, s13
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_nop 0
@@ -694,9 +695,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_add_u32 s8, s16, 32
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s11
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
; GFX7-NEXT: v_mov_b32_e32 v5, s9
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: s_nop 0
@@ -704,9 +705,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: s_add_u32 s4, s16, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-NEXT: v_mov_b32_e32 v4, s16
@@ -738,18 +739,18 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v5, s25
; GFX8-NEXT: s_addc_u32 s25, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NEXT: v_mov_b32_e32 v6, s26
; GFX8-NEXT: v_mov_b32_e32 v7, s27
+; GFX8-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NEXT: v_mov_b32_e32 v1, s25
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: s_add_u32 s20, s36, 0x50
; GFX8-NEXT: v_mov_b32_e32 v1, s21
; GFX8-NEXT: s_addc_u32 s21, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NEXT: v_mov_b32_e32 v3, s23
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s21
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -757,9 +758,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_add_u32 s16, s36, 64
; GFX8-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NEXT: s_addc_u32 s17, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v3, s19
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -767,9 +768,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_add_u32 s12, s36, 48
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: s_addc_u32 s13, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -777,9 +778,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_add_u32 s8, s36, 32
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: s_addc_u32 s9, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_nop 0
@@ -787,9 +788,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_add_u32 s4, s36, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s5, s37, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v4, s36
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b945c7c3def6ad..d74c8e019973a0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -418,9 +418,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -496,10 +496,10 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1468,9 +1468,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -1500,9 +1500,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -1627,9 +1627,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -1659,9 +1659,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -2079,24 +2079,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
+; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
@@ -2104,7 +2105,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -2137,24 +2137,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
@@ -2162,7 +2163,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
@@ -2889,9 +2889,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3
; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
@@ -2901,22 +2901,22 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
@@ -2924,9 +2924,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
@@ -2934,17 +2934,18 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
+; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
@@ -2952,7 +2953,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
@@ -2997,16 +2997,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
@@ -3014,9 +3014,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
@@ -3024,9 +3024,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
@@ -3034,9 +3034,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
@@ -3044,17 +3044,18 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
@@ -3062,7 +3063,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
@@ -3546,10 +3546,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70
; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
@@ -3574,29 +3574,29 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61
; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46
; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26
@@ -3726,93 +3726,93 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s67
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s66
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s65
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s63
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s61
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s59
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s57
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33
@@ -3825,13 +3825,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s55
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26
@@ -4426,8 +4426,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9
; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13
@@ -4435,12 +4435,12 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60
; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8
; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58
; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57
@@ -4467,49 +4467,49 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19]
-; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14
+; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6
; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60
+; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14
; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
+; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
+; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49
; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
+; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6
-; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50
-; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37
; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35
@@ -4521,15 +4521,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28
@@ -4547,6 +4547,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24
; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0
+; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1
@@ -4554,7 +4555,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -4623,107 +4623,107 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_bfe_i32 s66, s15, 0x80008
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xf0
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s14
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s65
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s13, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s61
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s12
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s55
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s54
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s49
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s40
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s35
@@ -4735,15 +4735,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
@@ -4761,6 +4761,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
@@ -4768,7 +4769,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
@@ -5868,9 +5868,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -5899,9 +5899,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -6937,45 +6937,45 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10
; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11
+; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50
+; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50
+; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25
@@ -6988,9 +6988,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7041,48 +7041,48 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 64
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 48
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27
@@ -7095,9 +7095,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -7441,82 +7441,82 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
-; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7
; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28
; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29
@@ -7584,92 +7584,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xb0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28
@@ -8206,9 +8206,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45
; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44
-; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64
; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45
; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65
; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
@@ -8216,13 +8216,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42
; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43
; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s55
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50
; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51
; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
@@ -8231,40 +8231,41 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29
; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42
; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29
; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80
; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62
; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63
; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48
; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49
-; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43
; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61
-; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45
; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18
; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61
+; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44
+; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45
; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s19
; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19
; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60
; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46
+; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19
+; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58
; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41
-; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28
-; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19
-; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50
+; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28
; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38
; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39
@@ -8278,12 +8279,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -8291,9 +8291,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s12, s8, 64
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_nop 0
@@ -8400,30 +8400,30 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47
; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s74
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s75
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xe0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s74
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s75
; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
-; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47
-; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s72
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s73
@@ -8437,12 +8437,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NOHSA-NEXT: s_add_u32 s42, s8, 0xa0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59
; GFX8-NOHSA-NEXT: s_addc_u32 s43, s9, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63
@@ -8455,9 +8455,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41
; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8465,9 +8465,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s34, s8, 0x80
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NOHSA-NEXT: s_addc_u32 s35, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8476,12 +8476,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27
; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27
; GFX8-NOHSA-NEXT: s_add_u32 s26, s8, 0x60
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NOHSA-NEXT: s_addc_u32 s27, s9, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29
@@ -8494,9 +8494,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s22, s8, 0x50
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23
; GFX8-NOHSA-NEXT: s_addc_u32 s23, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8504,9 +8504,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s16, s8, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
; GFX8-NOHSA-NEXT: s_addc_u32 s17, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8514,9 +8514,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s12, s8, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NOHSA-NEXT: s_addc_u32 s13, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8524,9 +8524,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
@@ -8534,9 +8534,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
@@ -10228,14 +10228,14 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -10281,15 +10281,15 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -10594,9 +10594,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -10667,9 +10667,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
@@ -11064,8 +11064,8 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0
@@ -11079,13 +11079,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -11162,8 +11162,8 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
@@ -11179,15 +11179,15 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 8589158f11a708..80f8059856289b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -226,8 +226,8 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-NEXT: s_add_u32 s2, s0, 4
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_short v[4:5], v1
@@ -525,8 +525,8 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
@@ -535,8 +535,8 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
@@ -675,8 +675,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
@@ -685,8 +685,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
@@ -1841,8 +1841,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3
@@ -1990,8 +1990,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3
@@ -2151,8 +2151,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2161,14 +2161,14 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1
@@ -2388,8 +2388,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
@@ -2397,13 +2397,13 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
@@ -2660,16 +2660,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
-; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
@@ -2696,23 +2696,22 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7
@@ -2722,6 +2721,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
@@ -3068,19 +3068,19 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
+; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
+; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
@@ -3095,9 +3095,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -3110,13 +3110,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14
; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18]
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
@@ -3129,8 +3129,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14]
; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
@@ -3142,16 +3140,18 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14]
; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10]
-; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16
@@ -3594,11 +3594,11 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: s_add_u32 s8, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9
; GCN-HSA-NEXT: s_add_u32 s10, s2, 64
+; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17]
-; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50
@@ -3609,8 +3609,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3618,8 +3618,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6
; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15]
@@ -3637,9 +3637,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35]
; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3
-; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
@@ -3663,21 +3663,21 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1
; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9
; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10
; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11
; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
@@ -3685,14 +3685,15 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7
; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
@@ -3700,9 +3701,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30
; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31
@@ -3712,7 +3713,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2
@@ -3723,10 +3723,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -3758,18 +3758,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -4396,27 +4397,27 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60
+; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50
-; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 64
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
-; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
@@ -4478,13 +4479,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12
; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14
; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16
@@ -4505,14 +4507,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
@@ -4560,20 +4561,20 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25
; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26
; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_endpgm
@@ -5767,14 +5768,14 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9
@@ -5914,8 +5915,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v2
@@ -6080,18 +6081,18 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -6295,13 +6296,13 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
@@ -6585,21 +6586,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8
@@ -6607,9 +6604,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0
@@ -6947,13 +6948,13 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48
@@ -7392,8 +7393,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
+; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
@@ -7401,8 +7402,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
-; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
@@ -7420,8 +7421,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14
; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50
; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1
@@ -7461,9 +7462,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20]
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15
@@ -7473,19 +7474,19 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0
; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20]
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13
@@ -7493,47 +7494,47 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
+; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
@@ -8138,14 +8139,14 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
+; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48
-; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10
+; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9
; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16
@@ -8183,25 +8184,26 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5
; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[14:15], 48
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v14
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v14
+; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16
@@ -8209,53 +8211,52 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_bfe_i32 v20, v0, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v6
; GCN-HSA-NEXT: v_bfe_i32 v22, v22, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GCN-HSA-NEXT: v_bfe_i32 v9, v23, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_bfe_i32 v3, v4, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
+; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0573de4a7f2d1d..4e0c7b9fe0184c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -460,27 +460,27 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -591,27 +591,27 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
@@ -721,27 +721,27 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
@@ -856,27 +856,27 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
@@ -993,14 +993,14 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1011,14 +1011,14 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
@@ -1807,8 +1807,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
@@ -1947,8 +1947,8 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1957,11 +1957,11 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
@@ -2139,8 +2139,8 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
@@ -2148,13 +2148,13 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
-; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
@@ -2373,19 +2373,19 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
@@ -2400,9 +2400,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -2415,13 +2415,13 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
-; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v15
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v14
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
@@ -2434,8 +2434,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v10
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v11
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
-; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
@@ -2447,16 +2445,18 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
+; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6)
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
@@ -2792,12 +2792,12 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2818,42 +2818,42 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
+; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9
-; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10
; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -3191,28 +3191,28 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13]
-; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
@@ -3341,28 +3341,28 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[16:19]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v15
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v14
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v14
; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v15
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s2
-; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v0
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
; GCNX3-HSA-NEXT: s_endpgm
@@ -3649,7 +3649,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
@@ -3657,6 +3656,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
+; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
@@ -3774,12 +3774,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6)
-; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v2
; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2
+; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v2
; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v3
-; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v25
; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v26
+; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v25
; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v27
; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v28
; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(4)
@@ -3827,13 +3827,13 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6
; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6
; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
-; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
-; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a3
; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v1
+; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
+; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
+; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a3
; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a2
; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a1
; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v32, a0
@@ -3972,15 +3972,15 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64
; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60
; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0
@@ -4002,8 +4002,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
@@ -4061,24 +4061,23 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
-; GCNX3-HSA-NEXT: s_nop 0
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27
; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3]
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
@@ -4112,16 +4111,16 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6
@@ -4497,12 +4496,12 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5
-; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
@@ -4515,16 +4514,16 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7
+; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
-; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
@@ -4532,10 +4531,10 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0
; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0
; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
@@ -4562,12 +4561,12 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 989ef6f981d9d6..ffee9635c05b50 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -81,13 +81,13 @@ define amdgpu_kernel void @caller() {
; GFX9-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
; GFX9-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x0
-; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12
; GFX9-SDAG-NEXT: s_mov_b32 s32, 0
@@ -111,13 +111,13 @@ define amdgpu_kernel void @caller() {
; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
; GFX9-GISEL-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
-; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13]
; GFX9-GISEL-NEXT: s_mov_b32 s12, s14
@@ -140,13 +140,13 @@ define amdgpu_kernel void @caller() {
; GFX9ARCH-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
; GFX9ARCH-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
-; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9
; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0
@@ -169,13 +169,13 @@ define amdgpu_kernel void @caller() {
; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4
; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0
-; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13]
; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index d4f75051b04d49..9170168531e33a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -962,8 +962,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3
; GFX11-NEXT: s_add_u32 s2, s6, s4
; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -980,8 +981,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3]
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%ext0 = zext i32 %arg0 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index af713179a888dd..607c3cbfce6166 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -93,12 +93,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
@@ -176,12 +176,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 8157b1a7f7c802..523b35732433e4 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -44,9 +44,9 @@ define void @issue63986(i64 %0, i64 %idxprom) {
; CHECK-NEXT: flat_load_ubyte v10, v[6:7]
; CHECK-NEXT: v_mov_b32_e32 v9, s5
; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s4, v2
-; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v9, vcc
; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -161,8 +161,8 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB1_8
; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual
; CHECK-NEXT: s_add_u32 s4, s6, 1
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 127656f7aa626c..3e4616becde4c7 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -99,7 +99,6 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -107,6 +106,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
@@ -384,7 +384,6 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -392,6 +391,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index a68d2e575607d4..70bb266c0d7b2a 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -32,10 +32,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -136,9 +136,9 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
@@ -180,10 +180,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB1_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -284,9 +284,9 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
@@ -330,8 +330,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_cbranch_execz .LBB2_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v1
-; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v9, v0
+; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v11, v5
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: s_mov_b32 s9, 0
@@ -428,8 +428,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v7
; CHECK-NEXT: v_mov_b32_e32 v6, v8
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,10 +472,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB3_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -576,9 +576,9 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
@@ -622,8 +622,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_cbranch_execz .LBB4_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v1
-; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v9, v0
+; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v11, v5
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: s_mov_b32 s9, 0
@@ -728,8 +728,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -772,10 +772,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB5_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -876,9 +876,9 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
@@ -918,10 +918,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB6_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -1022,9 +1022,9 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
@@ -1130,10 +1130,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
-; CHECK-NEXT: v_mov_b32_e32 v11, v1
-; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v10, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -1234,9 +1234,9 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
@@ -1353,8 +1353,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB10_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v2
-; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v9, v1
+; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v11, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: s_mov_b32 s9, 0
@@ -1824,8 +1824,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB15_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v2
-; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v9, v1
+; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v11, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: s_mov_b32 s8, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 15f31b4e86dbe5..6d04086cef8438 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -391,14 +391,14 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
+; GCN-NEXT: s_brev_b32 s0, 1
; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_brev_b32 s0, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_mov_b32 s1, s0
; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index af7f92798a9319..dbcb4cadd011cc 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -293,11 +293,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; CI-NEXT: s_min_i32 s2, s2, s6
; CI-NEXT: s_min_i32 s1, s1, s5
; CI-NEXT: s_min_i32 s0, s0, s4
-; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
@@ -311,11 +311,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; VI-NEXT: s_min_i32 s2, s2, s6
; VI-NEXT: s_min_i32 s1, s1, s5
; VI-NEXT: s_min_i32 s0, s0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -927,9 +927,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_or_b32 s1, s1, s7
; CI-NEXT: s_or_b32 s0, s0, s3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -957,9 +957,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI-NEXT: s_lshl_b32 s2, s7, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1359,9 +1359,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1373,9 +1373,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2801,9 +2801,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v0, s9
; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2832,9 +2832,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s8
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -3110,11 +3110,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; CI-NEXT: s_or_b32 s2, s2, s6
; CI-NEXT: s_or_b32 s1, s1, s5
; CI-NEXT: s_or_b32 s0, s0, s4
-; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
@@ -3156,11 +3156,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI-NEXT: s_or_b32 s2, s2, s6
; VI-NEXT: s_or_b32 s1, s1, s5
; VI-NEXT: s_or_b32 s0, s0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 43e3a1fa294838..8b0bf0fc7d4d9b 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -91,15 +91,15 @@ define amdgpu_kernel void @withcall() {
; GFX9-NEXT: s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21]
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b32 v3, v3 offset:8
@@ -156,15 +156,15 @@ define amdgpu_kernel void @withcall() {
; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0
+; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; G_GFX9-NEXT: s_mov_b32 s14, s10
; G_GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; G_GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
-; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21]
; G_GFX9-NEXT: v_mov_b32_e32 v3, 0
; G_GFX9-NEXT: v_mov_b32_e32 v4, 8
; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21]
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23]
; G_GFX9-NEXT: s_mov_b64 s[4:5], s[12:13]
; G_GFX9-NEXT: s_mov_b32 s12, s16
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 0adcb73422feff..7d85446ac84b08 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -340,8 +340,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_add_u32 s0, s4, 0x3039
@@ -363,8 +363,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_add_u32 s0, s4, 0x3039
@@ -458,12 +458,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_or_b32 s0, s2, 63
-; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_add_u32 s0, s8, 63
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_addc_u32 s1, s9, 0
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
@@ -480,12 +480,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_or_b32 s0, s2, 63
-; GFX8-NEXT: s_mov_b32 s5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_add_u32 s0, s8, 63
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index 2e9f09ad41813d..7c63edee0fb584 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Check that no attributes are added to graphics functions
; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s
@@ -40,6 +40,7 @@ define amdgpu_cs void @test_simple_indirect_call() {
; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
+;
; GFX10-LABEL: test_simple_indirect_call:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_getpc_b64 s[8:9]
@@ -51,8 +52,8 @@ define amdgpu_cs void @test_simple_indirect_call() {
; GFX10-NEXT: s_bitset0_b32 s11, 21
; GFX10-NEXT: s_add_u32 s8, s8, s0
; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a1197aeace86f0..c86ff60fea519e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -353,8 +353,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -1041,8 +1041,8 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -1356,8 +1356,8 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -1580,8 +1580,8 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -1787,8 +1787,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41]
; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -2057,8 +2057,8 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
@@ -2393,8 +2393,8 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12
; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: v_mov_b32_e32 v31, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index afe1f33d15e422..7b52944f835c35 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -184,9 +184,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v19, v25, v27
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_mov_b32_e32 v19, v9
; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v19, v9
; GFX9-NEXT: v_mov_b32_e32 v18, v8
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB0_3
@@ -1658,9 +1658,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v21, v23, v25
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
; GFX9-NEXT: v_and_b32_e32 v12, 1, v30
-; GFX9-NEXT: v_mov_b32_e32 v21, v13
; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v21, v13
; GFX9-NEXT: v_mov_b32_e32 v20, v12
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB1_3
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 017b37af4cdf26..5a696a7f7c9221 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -220,8 +220,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index db56589b799dda..f1350540d495a2 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -182,11 +182,11 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: v_mov_b32_e32 v1, s14
; GFX8-NEXT: v_mov_b32_e32 v4, s13
; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
-; GFX8-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index b4eb7750081222..f115f0ce192f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -351,14 +351,14 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s12, s4, s6
-; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_addc_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
-; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
; SI-NEXT: s_mov_b32 s0, s2
@@ -377,9 +377,9 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -437,8 +437,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_addc_u32 s9, s5, s7
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s4, s6, s4
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 96dd6276f7e382..c10f5445a7d69c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -217,9 +217,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s4, s6, s4
; GCN-IR-NEXT: s_subb_u32 s5, s7, s5
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: v_mov_b32_e32 v1, s5
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -418,22 +418,22 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
@@ -1287,9 +1287,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s4, s6, s4
; GCN-IR-NEXT: s_subb_u32 s5, s7, s5
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: v_mov_b32_e32 v1, s5
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -1458,22 +1458,22 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
@@ -1653,22 +1653,22 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
@@ -1761,11 +1761,11 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8
; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v3
; GCN-IR-NEXT: v_mov_b32_e32 v8, v2
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 459ef648fd806c..807492be67b39e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2106,8 +2106,8 @@ define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
; NOSDWA-NEXT: s_lshr_b32 s6, 0x100, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
@@ -2126,8 +2126,8 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: s_lshl_b32 s6, s4, 3
-; GFX89-NEXT: v_mov_b32_e32 v0, s4
; GFX89-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_mov_b64 s[4:5], 1
; GFX89-NEXT: v_mov_b32_e32 v2, s6
@@ -2146,8 +2146,8 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_lshl_b32 s6, s4, 3
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_mov_b64 s[4:5], 1
; GFX9-NEXT: v_mov_b32_e32 v2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
index d86e5e6ec7bac6..c691b59f378038 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
@@ -57,7 +57,7 @@ body: |
; GFX9-LABEL: name: sgpr96_aligned_src_dst
; GFX9: liveins: $sgpr0_sgpr1_sgpr2
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2, implicit-def $sgpr4_sgpr5_sgpr6
+ ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2
$sgpr4_sgpr5_sgpr6 = COPY $sgpr0_sgpr1_sgpr2
...
@@ -70,7 +70,7 @@ body: |
; GFX9-LABEL: name: sgpr96_killed
; GFX9: liveins: $sgpr4_sgpr5_sgpr6
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr10 = S_MOV_B32 $sgpr6, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr8_sgpr9_sgpr10
+ ; GFX9-NEXT: $sgpr10 = S_MOV_B32 $sgpr6, implicit $sgpr4_sgpr5_sgpr6
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr4_sgpr5, implicit killed $sgpr4_sgpr5_sgpr6
$sgpr8_sgpr9_sgpr10 = COPY killed $sgpr4_sgpr5_sgpr6
...
@@ -83,7 +83,7 @@ body: |
; GFX9-LABEL: name: sgpr128_forward
; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY $sgpr4_sgpr5_sgpr6_sgpr7
...
@@ -96,7 +96,7 @@ body: |
; GFX9-LABEL: name: sgpr128_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
$sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
...
@@ -109,7 +109,7 @@ body: |
; GFX9-LABEL: name: sgpr128_killed
; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7
...
@@ -122,7 +122,7 @@ body: |
; GFX9-LABEL: name: sgpr160_forward
; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
; GFX9-NEXT: $sgpr4 = S_MOV_B32 $sgpr12, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
@@ -136,7 +136,7 @@ body: |
; GFX9-LABEL: name: sgpr160_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
+ ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
$sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
@@ -150,7 +150,7 @@ body: |
; GFX9-LABEL: name: sgpr160_killed
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12
+ ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
$sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4
@@ -165,7 +165,7 @@ body: |
; GFX9-LABEL: name: sgpr192_forward
; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
@@ -179,7 +179,7 @@ body: |
; GFX9-LABEL: name: sgpr192_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
+ ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
$sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
@@ -193,7 +193,7 @@ body: |
; GFX9-LABEL: name: sgpr192_killed
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13
+ ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
$sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
@@ -207,7 +207,7 @@ body: |
; GFX9-LABEL: name: sgpr256_forward
; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -222,7 +222,7 @@ body: |
; GFX9-LABEL: name: sgpr256_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -237,7 +237,7 @@ body: |
; GFX9-LABEL: name: sgpr256_killed
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -252,7 +252,7 @@ body: |
; GFX9-LABEL: name: sgpr512_forward
; GFX9: liveins: $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
@@ -271,7 +271,7 @@ body: |
; GFX9-LABEL: name: sgpr512_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -290,7 +290,7 @@ body: |
; GFX9-LABEL: name: sgpr512_killed
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -309,7 +309,7 @@ body: |
; GFX9-LABEL: name: sgpr1024_forward
; GFX9: liveins: $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
@@ -336,7 +336,7 @@ body: |
; GFX9-LABEL: name: sgpr1024_backward
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
@@ -363,7 +363,7 @@ body: |
; GFX9-LABEL: name: sgpr1024_killed
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
index 47810346c50b7d..815ea68f791f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll
@@ -29,13 +29,13 @@ define amdgpu_kernel void @kernel() {
; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_mov_b32 s14, s10
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT: s_mov_b64 s[0:1], s[36:37]
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
+; GCN-NEXT: s_mov_b64 s[0:1], s[36:37]
; GCN-NEXT: s_mov_b64 s[2:3], s[38:39]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 37cf76103aa945..8fe0227b5d61b8 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -8,11 +8,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(ptr addrspace(1) %out, ptr add
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -73,10 +73,10 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(ptr addrspace(1) %out, ptr add
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 offset:8
; GCN-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
index b06739392e5075..bedf327715e50f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll
@@ -21352,10 +21352,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
; GFX900-NEXT: v_writelane_b32 v0, s50, 14
; GFX900-NEXT: v_writelane_b32 v0, s51, 15
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:19]
+; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[36:51]
+; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s46, s16
; GFX900-NEXT: s_mov_b32 s47, s17
@@ -21409,10 +21409,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() {
; GFX90A-NEXT: v_writelane_b32 v0, s50, 14
; GFX90A-NEXT: v_writelane_b32 v0, s51, 15
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:19]
+; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[36:51]
+; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s46, s16
; GFX90A-NEXT: s_mov_b32 s47, s17
@@ -21640,10 +21640,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX900-NEXT: v_writelane_b32 v0, s50, 14
; GFX900-NEXT: v_writelane_b32 v0, s51, 15
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:19]
+; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[36:51]
+; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s50, s16
; GFX900-NEXT: s_mov_b32 s51, s17
@@ -21697,10 +21697,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX90A-NEXT: v_writelane_b32 v0, s50, 14
; GFX90A-NEXT: v_writelane_b32 v0, s51, 15
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:19]
+; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[36:51]
+; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s50, s16
; GFX90A-NEXT: s_mov_b32 s51, s17
@@ -21740,10 +21740,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() {
; GFX940-NEXT: v_writelane_b32 v0, s30, 0
; GFX940-NEXT: v_writelane_b32 v0, s31, 1
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def s[0:15]
+; GFX940-NEXT: ; def s[16:31]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def s[16:31]
+; GFX940-NEXT: ; def s[0:15]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s30, s12
; GFX940-NEXT: s_mov_b32 s31, s13
@@ -22572,10 +22572,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
; GFX900-NEXT: v_writelane_b32 v0, s50, 14
; GFX900-NEXT: v_writelane_b32 v0, s51, 15
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:19]
+; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[36:51]
+; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s46, s18
; GFX900-NEXT: s_mov_b32 s47, s19
@@ -22629,10 +22629,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() {
; GFX90A-NEXT: v_writelane_b32 v0, s50, 14
; GFX90A-NEXT: v_writelane_b32 v0, s51, 15
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:19]
+; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[36:51]
+; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s46, s18
; GFX90A-NEXT: s_mov_b32 s47, s19
@@ -22860,10 +22860,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX900-NEXT: v_writelane_b32 v0, s50, 14
; GFX900-NEXT: v_writelane_b32 v0, s51, 15
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:19]
+; GFX900-NEXT: ; def s[36:51]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[36:51]
+; GFX900-NEXT: ; def s[4:19]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s50, s18
; GFX900-NEXT: s_mov_b32 s51, s19
@@ -22917,10 +22917,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX90A-NEXT: v_writelane_b32 v0, s50, 14
; GFX90A-NEXT: v_writelane_b32 v0, s51, 15
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:19]
+; GFX90A-NEXT: ; def s[36:51]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[36:51]
+; GFX90A-NEXT: ; def s[4:19]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s50, s18
; GFX90A-NEXT: s_mov_b32 s51, s19
@@ -22960,10 +22960,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() {
; GFX940-NEXT: v_writelane_b32 v0, s30, 0
; GFX940-NEXT: v_writelane_b32 v0, s31, 1
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def s[0:15]
+; GFX940-NEXT: ; def s[16:31]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def s[16:31]
+; GFX940-NEXT: ; def s[0:15]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s30, s14
; GFX940-NEXT: s_mov_b32 s31, s15
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 59d7fe107ee537..ed883f81bd0bf5 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
@@ -56,13 +56,13 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; GFX9-NEXT: s_getpc_b64 s[18:19]
; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12
+; GFX9-NEXT: v_mov_b32_e32 v4, s19
; GFX9-NEXT: s_mov_b32 s14, s16
; GFX9-NEXT: v_mad_u32_u24 v3, v1, s15, v3
; GFX9-NEXT: v_add_lshl_u32 v5, v3, v2, 3
-; GFX9-NEXT: v_mov_b32_e32 v3, s18
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, s19
+; GFX9-NEXT: v_mov_b32_e32 v3, s18
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: ds_write_b64 v5, v[3:4]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 6b40df0345ebe3..d74e6278c3339d 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -42,8 +42,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -56,8 +56,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -236,8 +236,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -250,8 +250,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -287,8 +287,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -301,8 +301,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -357,8 +357,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -371,8 +371,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index ae166212fe79de..52cc9efc326d40 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -7,9 +7,9 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1)
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v2, v0
; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 71033cfd1a6f34..000b79e0c615bc 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -359,9 +359,9 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: s_add_i32 s3, s3, 0x20000
; VI-NEXT: s_add_i32 s2, s2, 0x20000
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e2bcf3f6a2e2cd..272a10aaa29561 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10405,14 +10405,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_nop 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
@@ -10420,7 +10420,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0
@@ -10554,89 +10553,89 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v56
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v55
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v60
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v34
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v40
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v44
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v48
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v22
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v52
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v26
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v24
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v28
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v50
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v51
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v46
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v47
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v42
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v43
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v38
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v58
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v59
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60
@@ -10648,26 +10647,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v89
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v90
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v91
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v92
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v86
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v87
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v88
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v82
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v83
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v84
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v78
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v79
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v80
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v74
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v75
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v76
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v70
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v71
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v72
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 6423267be4b34f..bb64d36395d48a 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -3350,10 +3350,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mov_b32_e32 v8, 0
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_add_u32 s0, s6, 16
-; TONGA-NEXT: v_mov_b32_e32 v4, s6
; TONGA-NEXT: s_addc_u32 s1, s7, 0
-; TONGA-NEXT: v_mov_b32_e32 v0, s0
+; TONGA-NEXT: v_mov_b32_e32 v4, s6
; TONGA-NEXT: v_mov_b32_e32 v5, s7
+; TONGA-NEXT: v_mov_b32_e32 v0, s0
; TONGA-NEXT: v_mov_b32_e32 v1, s1
; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -6094,11 +6094,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_add_u32 s0, s6, 48
; TONGA-NEXT: s_addc_u32 s1, s7, 0
; TONGA-NEXT: s_add_u32 s2, s6, 32
-; TONGA-NEXT: v_mov_b32_e32 v0, s6
; TONGA-NEXT: s_addc_u32 s3, s7, 0
; TONGA-NEXT: v_mov_b32_e32 v2, s2
-; TONGA-NEXT: v_mov_b32_e32 v1, s7
; TONGA-NEXT: v_mov_b32_e32 v3, s3
+; TONGA-NEXT: v_mov_b32_e32 v0, s6
+; TONGA-NEXT: v_mov_b32_e32 v1, s7
; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
; TONGA-NEXT: v_mov_b32_e32 v0, s0
@@ -6634,11 +6634,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc
; TONGA-NEXT: .LBB12_12:
+; TONGA-NEXT: s_add_u32 s0, s4, 16
; TONGA-NEXT: v_mov_b32_e32 v0, s4
; TONGA-NEXT: v_mov_b32_e32 v1, s5
-; TONGA-NEXT: s_add_u32 s0, s4, 16
-; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; TONGA-NEXT: s_addc_u32 s1, s5, 0
+; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; TONGA-NEXT: v_mov_b32_e32 v0, s0
; TONGA-NEXT: v_mov_b32_e32 v1, s1
; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 23364e860d1542..3e5b67782dc408 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -394,22 +394,22 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
@@ -1137,9 +1137,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s4, s6, s4
; GCN-IR-NEXT: s_subb_u32 s5, s7, s5
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: v_mov_b32_e32 v1, s5
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -1575,22 +1575,22 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
@@ -1768,22 +1768,22 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
@@ -1882,11 +1882,11 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 5641c43c40084c..5732ebfa7fcbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -16,11 +16,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF-NEXT: s_mov_b32 s38, -1
; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000
; MUBUF-NEXT: s_add_u32 s36, s36, s11
-; MUBUF-NEXT: s_addc_u32 s37, s37, 0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v3, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000
+; MUBUF-NEXT: s_addc_u32 s37, s37, 0
+; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39]
; MUBUF-NEXT: s_mov_b32 s32, 0xc0000
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
@@ -28,7 +29,6 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s0
; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37]
-; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39]
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index 8f16fcf6d08906..4dd1d8bea5bfeb 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -855,13 +855,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200
+; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42
+; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17
; WAVE32-OPT-NEXT: s_mov_b32 s13, s9
+; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; WAVE32-OPT-NEXT: s_mov_b32 s12, s8
; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[4:5]
; WAVE32-OPT-NEXT: s_mov_b32 s4, s32
-; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42
-; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17
-; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; WAVE32-OPT-NEXT: s_mov_b32 s14, s10
; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
@@ -892,13 +892,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400
+; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42
+; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17
; WAVE64-OPT-NEXT: s_mov_b32 s13, s9
+; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; WAVE64-OPT-NEXT: s_mov_b32 s12, s8
; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[4:5]
; WAVE64-OPT-NEXT: s_mov_b32 s4, s32
-; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42
-; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17
-; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; WAVE64-OPT-NEXT: s_mov_b32 s14, s10
; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi
; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index 76ed4f6238dbed..5539a1ec38af53 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -67,9 +67,9 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x)
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, s6
+; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: ds_store_b128 v4, v[0:3]
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, ptr addrspace(3) %out
@@ -520,8 +520,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NEXT: s_endpgm
@@ -534,8 +534,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT: s_endpgm
@@ -574,10 +574,9 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, s6
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, ptr addrspace(3) %out, align 8
@@ -646,9 +645,9 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, s6
+; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: ds_store_b128 v4, v[0:3]
; GFX11-NEXT: s_endpgm
store <4 x i32> %x, ptr addrspace(3) %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f791135d45e9aa..789fd6b4bc858f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -261,8 +261,8 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 001c35ef30cc65..da5d847e3eeb27 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -650,8 +650,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX8-NEXT: s_sub_u32 s0, s0, s2
; GFX8-NEXT: s_subb_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -676,8 +676,9 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_endpgm
%result = sub i64 %a, %b
@@ -940,8 +941,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v10, v14
; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v12
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index d4329aec2021c0..cce79417777cce 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -12,18 +12,18 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) #
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT: ; %bb.1: ; %ift
; CHECK-NEXT: s_mov_b32 s4, s5
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s6
; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: ; %bb.2: ; %ife
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 9189cef019cf40..8b7d118e8a3950 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -23,8 +23,8 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: s_mov_b32 s2, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: s_mov_b32 s3, 0x40260000
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index b8f0d7617167e0..24e977bb428e0b 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -129,8 +129,8 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX803-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
; HSA-TRAP-GFX803-NEXT: s_cbranch_vccz .LBB1_2
; HSA-TRAP-GFX803-NEXT: ; %bb.1: ; %ret
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index 5f01db82ccd48d..928f16f925cf5d 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -94,11 +94,11 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out,
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_and_b32 s6, s6, 0xffff
; VI-NEXT: s_or_b32 s5, s6, s5
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -201,11 +201,11 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out,
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s5, s7, s5
; VI-NEXT: s_or_b32 s0, s0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s34
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s34
; VI-NEXT: v_mov_b32_e32 v5, s35
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 6606b1d050421c..ad0bfab951de2d 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -32,8 +32,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_addc_u32 s1, s3, s5
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
@@ -329,12 +329,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -352,8 +352,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index db7d816386a70d..4a7f53aacd65df 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -186,9 +186,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3]
; GCN-IR-NEXT: .LBB0_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mov_b32_e32 v1, s9
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -360,22 +360,22 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
@@ -949,9 +949,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
; GCN-IR-NEXT: .LBB8_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
; GCN-IR-NEXT: v_mov_b32_e32 v1, s7
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -1107,22 +1107,22 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
@@ -1204,11 +1204,11 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3
@@ -1314,9 +1314,9 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
; GCN-IR-NEXT: .LBB11_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
; GCN-IR-NEXT: v_mov_b32_e32 v1, s7
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
@@ -1403,11 +1403,11 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index a56346f3bb45bc..e806182e0721b5 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -218,6 +218,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX8-NEXT: s_sub_i32 s6, 0, s2
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -240,7 +241,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s0, s6, s0
; GFX8-NEXT: s_sub_i32 s2, 0, s3
; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 55cbc14a467068..e20af85bb53ef4 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -144,9 +144,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; SI-NEXT: v_ldexp_f64 v[6:7], v[8:9], 32
; SI-NEXT: v_ldexp_f64 v[8:9], v[10:11], 32
; SI-NEXT: s_add_u32 s0, s8, 16
-; SI-NEXT: s_addc_u32 s1, s9, 0
; SI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
; SI-NEXT: v_add_f64 v[4:5], v[8:9], v[12:13]
+; SI-NEXT: s_addc_u32 s1, s9, 0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -180,8 +180,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI-NEXT: s_add_u32 s0, s8, 16
; VI-NEXT: s_addc_u32 s1, s9, 0
; VI-NEXT: v_mov_b32_e32 v11, s1
-; VI-NEXT: v_mov_b32_e32 v8, s8
; VI-NEXT: v_mov_b32_e32 v10, s0
+; VI-NEXT: v_mov_b32_e32 v8, s8
; VI-NEXT: v_mov_b32_e32 v9, s9
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -240,11 +240,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2
-; SI-NEXT: s_add_u32 s0, s4, 16
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
+; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; SI-NEXT: s_add_u32 s0, s4, 16
; SI-NEXT: s_addc_u32 s1, s5, 0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v8, s0
@@ -260,11 +260,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2
-; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
+; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
@@ -290,8 +290,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -304,8 +304,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -406,8 +406,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -420,8 +420,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -457,8 +457,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -471,8 +471,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -509,8 +509,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
-; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
@@ -523,8 +523,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index a794d139063d5f..82b6dac3c4138b 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -369,22 +369,22 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
@@ -1208,22 +1208,22 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3
@@ -1311,11 +1311,11 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 2f4f08175be0ed..b1ee99350f31de 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -33,8 +33,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s0, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_subb_u32 s1, s3, s5
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
@@ -329,12 +329,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -352,8 +352,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 5360ff2fa402f1..b675ea78439906 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -46,10 +46,10 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -2317,10 +2317,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -2429,11 +2429,11 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 934d9efba46564..58d8aecc9e714a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -189,8 +189,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
; GISEL-VI-NEXT: flat_store_dword v[0:1], v2
; GISEL-VI-NEXT: s_endpgm
@@ -626,8 +626,8 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
; GISEL-VI-NEXT: flat_store_dword v[0:1], v2
; GISEL-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b85bd4c6346684..79c55cfceec553 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1999,10 +1999,10 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_endpgm
%ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b5e4bcd049c42a..02c0a7f9be9192 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -485,9 +485,9 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB9_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v1, v3
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: v_mov_b32_e32 v2, v4
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
; GFX906-NEXT: .LBB9_4: ; %bb.3
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 0307472fce7327..2cb46d66d31de8 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -880,8 +880,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_cmp_ge_u32 s1, s4
; GFX1032-NEXT: s_mov_b32 s1, 0
; GFX1032-NEXT: s_cselect_b32 s0, s5, s0
-; GFX1032-NEXT: v_mov_b32_e32 v0, s0
; GFX1032-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032-NEXT: v_mov_b32_e32 v0, s0
; GFX1032-NEXT: .LBB15_3:
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
@@ -1041,8 +1041,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_cmp_ge_u32 s1, s4
; GFX1064-NEXT: s_mov_b32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s0, s5, s0
-; GFX1064-NEXT: v_mov_b32_e32 v0, s0
; GFX1064-NEXT: v_mov_b32_e32 v1, s1
+; GFX1064-NEXT: v_mov_b32_e32 v0, s0
; GFX1064-NEXT: .LBB15_3:
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 82fae44e208186..6afa4627f7e3e6 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -530,8 +530,8 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
; SI-NEXT: s_and_b32 s1, s1, 1
; SI-NEXT: s_add_u32 s4, s1, 0x3e7
; SI-NEXT: s_addc_u32 s5, 0, 0
-; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index deab4075818805..4dffac371d1efb 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1943,9 +1943,9 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4
; GFX9-W64-NEXT: .LBB35_2: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 08cc2e4ec7d794..fb8811daf9542e 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -417,16 +417,16 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6
; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: s_add_u32 s8, s4, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
-; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-O3-NEXT: s_add_u32 s8, s4, 56
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
+; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_getpc_b64 s[22:23]
@@ -690,8 +690,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
@@ -1269,16 +1269,16 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6
; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: s_add_u32 s8, s4, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
-; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-O3-NEXT: s_add_u32 s8, s4, 56
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
+; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: s_getpc_b64 s[22:23]
@@ -1542,8 +1542,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 8c9dac781d5da1..500e3b0435df91 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -610,9 +610,9 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b32 s1, s1, 0xf237b
; VI-NEXT: s_xor_b32 s0, s0, 0x3039
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -652,13 +652,13 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-NEXT: s_mov_b32 s7, 0xf237b
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s2, 0x3039
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s1, s3, 0xf237b
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
@@ -692,9 +692,9 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b32 s0, s0, 63
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -724,8 +724,8 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
More information about the llvm-commits
mailing list