[llvm] [AMDGPU] Remove implicit defs on expanded mov64 pseudos (PR #190379)

Joe Nash via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 3 11:15:23 PDT 2026


https://github.com/Sisyph updated https://github.com/llvm/llvm-project/pull/190379

>From 7364902fae0bae6b29f9d1eeb05d0195551b004d Mon Sep 17 00:00:00 2001
From: Joseph Nash <joseph.nash at amd.com>
Date: Thu, 2 Apr 2026 18:47:35 -0400
Subject: [PATCH 1/3] [AMDGPU] Fix implicit defs on mov64 pseudos

The mov64 pseudo is split into two 32-bit movs, but each of those 32-bit
movs still carried an implicit def of the full 64-bit register. This
spurious implicit def inhibited VOPD formation; with it removed, more
VOPD instructions can be formed.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  16 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  |   6 +-
 .../AMDGPU/GlobalISel/atomicrmw_minmax.ll     |  60 +-
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  | 160 +++--
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  | 256 ++++----
 .../GlobalISel/call-outgoing-stack-args.ll    |   2 +-
 .../AMDGPU/GlobalISel/combine-short-clamp.ll  |  15 +-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        |   8 +-
 .../AMDGPU/GlobalISel/extractelement.ll       |  52 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  18 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll   |   4 +-
 .../AMDGPU/GlobalISel/insertelement.i16.ll    | 321 +++++-----
 .../AMDGPU/GlobalISel/insertelement.i8.ll     | 228 ++++---
 .../AMDGPU/GlobalISel/insertelement.ll        |  28 +-
 .../GlobalISel/llvm.amdgcn.rsq.clamp.ll       |  64 +-
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    |   4 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  20 +-
 .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll |  28 +-
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   |   6 +-
 .../abi-attribute-hints-undefined-behavior.ll |   2 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |   4 +-
 .../atomic_optimizations_global_pointer.ll    |  46 +-
 .../atomic_optimizations_local_pointer.ll     |  69 +--
 .../AMDGPU/av_movimm_pseudo_expansion.mir     |  76 +--
 .../CodeGen/AMDGPU/calling-conventions.ll     | 143 ++---
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  20 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |  24 +-
 .../CodeGen/AMDGPU/dag-divergence-atomic.ll   |   4 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |  18 +-
 llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll     |  21 +-
 .../expand-scalar-carry-out-select-user.ll    |   8 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i8.ll   |   4 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     | 444 ++++++--------
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll |  12 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |   8 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        |  26 +-
 llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll |  18 +-
 llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll |  22 +-
 llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll |  14 +-
 llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll |  14 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 133 ++---
 .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll   |   8 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll   |  24 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll   |  24 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll   |   8 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 114 ++--
 .../AMDGPU/global_atomics_scan_fmax.ll        |  63 +-
 .../AMDGPU/global_atomics_scan_fmin.ll        |  63 +-
 .../AMDGPU/global_atomics_scan_fsub.ll        | 114 ++--
 .../CodeGen/AMDGPU/inflate-av-remat-imm.mir   |  20 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll  |  44 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll  |  44 +-
 .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll |  12 +-
 .../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll |   6 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll  |   8 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll    |  12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |   4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |   4 +-
 .../llvm.amdgcn.set.inactive.chain.arg.ll     |  10 +-
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        |   4 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.f64.ll      | 368 ++++++------
 llvm/test/CodeGen/AMDGPU/llvm.exp10.f64.ll    | 558 +++++++++---------
 llvm/test/CodeGen/AMDGPU/llvm.exp2.f64.ll     | 508 ++++++++--------
 llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll |  30 +-
 .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll    |  24 +-
 .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll    |  24 +-
 llvm/test/CodeGen/AMDGPU/lrint.ll             |   4 +-
 llvm/test/CodeGen/AMDGPU/lround.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/memset-pattern.ll    |  16 +-
 .../AMDGPU/misaligned-vgpr-regsequence.mir    |   2 +-
 .../CodeGen/AMDGPU/move-to-valu-lshl_add.ll   |   3 +-
 .../CodeGen/AMDGPU/offset-split-global.ll     |  24 +-
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll       |  12 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |   4 +-
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        |   8 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |  12 +-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |  14 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |  12 +-
 .../siloadstoreopt-misaligned-regsequence.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/swdev380865.ll       |   4 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  12 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |   2 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |   6 +-
 .../AMDGPU/v_mov_b64_expand_and_shrink.mir    |   4 +-
 .../CodeGen/AMDGPU/v_mov_b64_expansion.mir    |  40 +-
 .../AMDGPU/vgpr-mark-last-scratch-load.ll     |  37 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |  57 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |   4 +-
 96 files changed, 2271 insertions(+), 2586 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index bafa6cd800b5e..ab044f2542e9a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2163,10 +2163,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
           .addImm(SignExtend64<32>(Imm))
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstLo, RegState::Implicit | RegState::Define);
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
           .addImm(SignExtend64<32>(Imm >> 32))
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstHi, RegState::Implicit | RegState::Define);
       MI.eraseFromParent();
       break;
     }
@@ -2212,10 +2212,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
           .addImm(Lo.getSExtValue())
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstLo, RegState::Implicit | RegState::Define);
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
           .addImm(Hi.getSExtValue())
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstHi, RegState::Implicit | RegState::Define);
       }
     } else {
       assert(SrcOp.isReg());
@@ -2234,10 +2234,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstLo, RegState::Implicit | RegState::Define);
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
-          .addReg(Dst, RegState::Implicit | RegState::Define);
+          .addReg(DstHi, RegState::Implicit | RegState::Define);
       }
     }
     MI.eraseFromParent();
@@ -2270,10 +2270,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     APInt Hi(32, Imm.getHiBits(32).getZExtValue());
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
       .addImm(Lo.getSExtValue())
-      .addReg(Dst, RegState::Implicit | RegState::Define);
+      .addReg(DstLo, RegState::Implicit | RegState::Define);
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
       .addImm(Hi.getSExtValue())
-      .addReg(Dst, RegState::Implicit | RegState::Define);
+      .addReg(DstHi, RegState::Implicit | RegState::Define);
     MI.eraseFromParent();
     break;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index a20387b17c53d..68d15b91932ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -658,10 +658,10 @@ define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1
 ; GFX6-LABEL: s_andn2_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -711,10 +711,10 @@ define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inr
 ; GFX6-LABEL: s_andn2_v3i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -764,10 +764,10 @@ define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 ; GFX6-LABEL: s_andn2_v3i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_minmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_minmax.ll
index 2b70d83c20330..d233423e69135 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_minmax.ll
@@ -91,8 +91,8 @@ define amdgpu_kernel void @global_atomic_min_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_min_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_min_i64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -164,8 +164,8 @@ define amdgpu_kernel void @flat_atomic_min_ret_i64(ptr addrspace(1) %out, ptr %p
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -200,10 +200,9 @@ define amdgpu_kernel void @flat_atomic_min_ret_i64(ptr addrspace(1) %out, ptr %p
 ; GFX11-LABEL: flat_atomic_min_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -297,9 +296,9 @@ define amdgpu_kernel void @local_atomic_min_ret_i64(ptr addrspace(1) %out, ptr a
 ; GFX11-LABEL: local_atomic_min_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
@@ -390,8 +389,8 @@ define amdgpu_kernel void @global_atomic_max_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_max_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_max_i64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -463,8 +462,8 @@ define amdgpu_kernel void @flat_atomic_max_ret_i64(ptr addrspace(1) %out, ptr %p
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -499,10 +498,9 @@ define amdgpu_kernel void @flat_atomic_max_ret_i64(ptr addrspace(1) %out, ptr %p
 ; GFX11-LABEL: flat_atomic_max_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -596,9 +594,9 @@ define amdgpu_kernel void @local_atomic_max_ret_i64(ptr addrspace(1) %out, ptr a
 ; GFX11-LABEL: local_atomic_max_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
@@ -689,8 +687,8 @@ define amdgpu_kernel void @global_atomic_umin_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_umin_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_min_u64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -762,8 +760,8 @@ define amdgpu_kernel void @flat_atomic_umin_ret_i64(ptr addrspace(1) %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -798,10 +796,9 @@ define amdgpu_kernel void @flat_atomic_umin_ret_i64(ptr addrspace(1) %out, ptr %
 ; GFX11-LABEL: flat_atomic_umin_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -895,9 +892,9 @@ define amdgpu_kernel void @local_atomic_umin_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: local_atomic_umin_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
@@ -988,8 +985,8 @@ define amdgpu_kernel void @global_atomic_umax_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_umax_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_max_u64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1061,8 +1058,8 @@ define amdgpu_kernel void @flat_atomic_umax_ret_i64(ptr addrspace(1) %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1097,10 +1094,9 @@ define amdgpu_kernel void @flat_atomic_umax_ret_i64(ptr addrspace(1) %out, ptr %
 ; GFX11-LABEL: flat_atomic_umax_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1194,9 +1190,9 @@ define amdgpu_kernel void @local_atomic_umax_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: local_atomic_umax_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index df3693620bdb7..f0c8dae40280a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1716,9 +1716,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1740,9 +1740,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1764,8 +1764,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1802,10 +1802,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_dec_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1824,8 +1823,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1850,8 +1849,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1875,8 +1874,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1915,10 +1914,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -1937,9 +1935,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1953,9 +1951,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1969,8 +1967,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2002,8 +2000,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_dec_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1]
@@ -2022,8 +2019,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2040,8 +2037,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2057,8 +2054,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2092,8 +2089,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
@@ -2113,8 +2109,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2131,8 +2127,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2148,8 +2144,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2183,8 +2179,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
@@ -2211,8 +2206,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -2240,8 +2235,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -2262,8 +2257,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
@@ -2312,9 +2307,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2352,8 +2347,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -2373,8 +2368,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -2387,8 +2382,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -2428,9 +2423,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2618,9 +2613,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-LABEL: lds_atomic_dec_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
@@ -2706,9 +2701,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-LABEL: lds_atomic_dec_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
@@ -2774,9 +2769,9 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX11-LABEL: lds_atomic_dec_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    ds_dec_u64 v2, v[0:1]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2836,9 +2831,9 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX11-LABEL: lds_atomic_dec_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    ds_dec_u64 v2, v[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2853,9 +2848,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -2872,9 +2867,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -2917,8 +2912,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_dec_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -2937,8 +2932,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -2958,8 +2953,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3004,8 +2999,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: global_atomic_dec_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3025,8 +3020,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3046,8 +3041,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3092,8 +3087,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3112,9 +3107,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3128,9 +3123,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3168,8 +3163,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX11-LABEL: global_atomic_dec_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3186,8 +3181,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3204,8 +3199,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3245,8 +3240,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-LABEL: global_atomic_dec_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3264,8 +3259,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3282,8 +3277,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3323,8 +3318,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3349,8 +3344,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -3375,8 +3370,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
@@ -3419,10 +3414,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -3452,8 +3447,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -3473,8 +3468,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -3510,10 +3505,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:40
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3531,8 +3526,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3553,8 +3548,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3574,8 +3569,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3605,11 +3600,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 9
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 7b1178ca4da8a..62caf7f406325 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -1257,9 +1257,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-LABEL: lds_atomic_inc_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
@@ -1272,9 +1272,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX12-LABEL: lds_atomic_inc_ret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
@@ -1358,9 +1358,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-LABEL: lds_atomic_inc_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
@@ -1373,9 +1373,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX12-LABEL: lds_atomic_inc_ret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
@@ -1439,9 +1439,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX11-LABEL: lds_atomic_inc_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    ds_inc_u64 v2, v[0:1]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -1450,9 +1450,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX12-LABEL: lds_atomic_inc_noret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    ds_inc_u64 v2, v[0:1]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
@@ -1512,9 +1512,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    ds_inc_u64 v2, v[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -1523,9 +1523,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX12-LABEL: lds_atomic_inc_noret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    ds_inc_u64 v2, v[0:1] offset:32
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
@@ -1540,9 +1540,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1559,9 +1559,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1604,8 +1604,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-LABEL: global_atomic_inc_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1617,8 +1617,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX12-LABEL: global_atomic_inc_ret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
@@ -1636,8 +1636,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1657,8 +1657,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1703,8 +1703,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-LABEL: global_atomic_inc_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1716,8 +1716,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX12-LABEL: global_atomic_inc_ret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
@@ -1736,8 +1736,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1757,8 +1757,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -1803,8 +1803,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1816,8 +1816,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; GFX12-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
@@ -1837,9 +1837,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1853,9 +1853,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1893,8 +1893,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX11-LABEL: global_atomic_inc_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1905,8 +1905,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX12-LABEL: global_atomic_inc_noret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
@@ -1922,8 +1922,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -1940,8 +1940,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -1981,8 +1981,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-LABEL: global_atomic_inc_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1993,8 +1993,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; GFX12-LABEL: global_atomic_inc_noret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
@@ -2011,8 +2011,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2029,8 +2029,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -2070,8 +2070,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2082,8 +2082,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; GFX12-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
@@ -2109,8 +2109,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -2135,8 +2135,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -2179,10 +2179,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -2194,10 +2194,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; GFX12-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
@@ -2226,8 +2226,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -2247,8 +2247,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -2284,10 +2284,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2298,10 +2298,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; GFX12-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
@@ -3210,8 +3210,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3232,8 +3232,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3253,8 +3253,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3284,11 +3284,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 9
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3303,11 +3302,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX12-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 9
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
 ; GFX12-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX12-NEXT:    ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16
@@ -3333,9 +3331,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3357,9 +3355,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3381,8 +3379,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3419,10 +3417,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_inc_ret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -3434,10 +3431,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX12-LABEL: flat_atomic_inc_ret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
@@ -3455,8 +3451,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3481,8 +3477,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3506,8 +3502,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3546,10 +3542,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -3561,10 +3556,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX12-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
@@ -3583,8 +3577,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3609,8 +3603,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
@@ -3634,8 +3628,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3674,10 +3668,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
@@ -3689,10 +3682,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
@@ -3712,9 +3704,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3728,9 +3720,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -3744,8 +3736,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3777,8 +3769,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_inc_noret_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1]
@@ -3791,8 +3782,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX12-LABEL: flat_atomic_inc_noret_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV
@@ -3809,8 +3799,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3827,8 +3817,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3844,8 +3834,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3879,8 +3869,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
@@ -3893,8 +3882,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX12-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
@@ -3912,8 +3900,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CI-NEXT:    s_add_i32 s12, s12, s17
 ; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3930,8 +3918,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
@@ -3947,8 +3935,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3982,8 +3970,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
@@ -3996,8 +3983,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, 42
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
@@ -4024,8 +4010,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -4053,8 +4039,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
@@ -4075,8 +4061,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
@@ -4125,9 +4111,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -4148,9 +4134,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 42
+; GFX12-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -4188,8 +4174,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -4209,8 +4195,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -4223,8 +4209,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -4264,9 +4250,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -4283,9 +4269,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 42
+; GFX12-NEXT:    v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v3, 0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index c16c8e2128c72..952e23168f6bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -156,8 +156,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; FLATSCR-LABEL: kernel_caller_byval:
 ; FLATSCR:       ; %bb.0:
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index 567be78f19614..4fc8fbc0eeea6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -99,9 +99,9 @@ define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
 ; GFX678-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX678-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX678-NEXT:    v_mov_b32_e32 v4, 0x8001
-; GFX678-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX678-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX678-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX678-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX678-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX678-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX678-NEXT:    v_mov_b32_e32 v4, 0xffff8000
@@ -115,9 +115,9 @@ define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x8001
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff8000
@@ -140,8 +140,8 @@ define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0x8001, v[0:1]
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_cndmask_b32 v0, 0x8001, v0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0xffff8000 :: v_dual_mov_b32 v3, -1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x8001, v0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
@@ -162,9 +162,9 @@ define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
 ; GFX678-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX678-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX678-NEXT:    v_mov_b32_e32 v4, 0xffff7fff
-; GFX678-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GFX678-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX678-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX678-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GFX678-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX678-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX678-NEXT:    v_mov_b32_e32 v4, 0x8000
@@ -178,9 +178,9 @@ define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x8000
@@ -202,8 +202,7 @@ define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
 ; GFX11-LABEL: v_clamp_i64_i16_invalid_lower_and_higher:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff7fff
-; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0xffff7fff :: v_dual_mov_b32 v3, -1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xffff7fff, v0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 92fd4466cf8a5..de586a5921724 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1435,10 +1435,10 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, 0xff
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_ffbh_i32_e32 v0, 0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    v_min_u32_e32 v3, 32, v0
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[1:2], v3
 ; SI-NEXT:    v_min_u32_e32 v0, 1, v0
@@ -1452,10 +1452,10 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xff
-; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; VI-NEXT:    v_ffbh_i32_e32 v0, 0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_min_u32_e32 v3, 32, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v3, v[1:2]
 ; VI-NEXT:    v_min_u32_e32 v0, 1, v0
@@ -1474,9 +1474,9 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, 0xff
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v0, 0
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    v_min_u32_e32 v3, 32, v0
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[1:2], v3
 ; SI-NEXT:    v_min_u32_e32 v0, 1, v0
@@ -1490,9 +1490,9 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xff
-; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_min_u32_e32 v3, 32, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v3, v[1:2]
 ; VI-NEXT:    v_min_u32_e32 v0, 1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 206011adf0213..6d434ddcdb098 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2023,34 +2023,34 @@ entry:
 define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v16f64_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s66, 0
-; GCN-NEXT:    s_mov_b32 s64, 0
-; GCN-NEXT:    s_mov_b32 s62, 0
-; GCN-NEXT:    s_mov_b32 s60, 0
-; GCN-NEXT:    s_mov_b32 s58, 0
-; GCN-NEXT:    s_mov_b32 s56, 0
-; GCN-NEXT:    s_mov_b32 s54, 0
-; GCN-NEXT:    s_mov_b32 s52, 0
-; GCN-NEXT:    s_mov_b32 s50, 0
-; GCN-NEXT:    s_mov_b32 s48, 0
-; GCN-NEXT:    s_mov_b32 s46, 0
-; GCN-NEXT:    s_mov_b32 s44, 0
-; GCN-NEXT:    s_mov_b32 s40, 0
 ; GCN-NEXT:    s_mov_b64 s[36:37], 1.0
 ; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    s_mov_b32 s66, 0
 ; GCN-NEXT:    s_mov_b32 s67, 0x40300000
+; GCN-NEXT:    s_mov_b32 s64, 0
 ; GCN-NEXT:    s_mov_b32 s65, 0x402e0000
+; GCN-NEXT:    s_mov_b32 s62, 0
 ; GCN-NEXT:    s_mov_b32 s63, 0x402c0000
+; GCN-NEXT:    s_mov_b32 s60, 0
 ; GCN-NEXT:    s_mov_b32 s61, 0x402a0000
+; GCN-NEXT:    s_mov_b32 s58, 0
 ; GCN-NEXT:    s_mov_b32 s59, 0x40280000
+; GCN-NEXT:    s_mov_b32 s56, 0
 ; GCN-NEXT:    s_mov_b32 s57, 0x40260000
+; GCN-NEXT:    s_mov_b32 s54, 0
 ; GCN-NEXT:    s_mov_b32 s55, 0x40240000
+; GCN-NEXT:    s_mov_b32 s52, 0
 ; GCN-NEXT:    s_mov_b32 s53, 0x40220000
+; GCN-NEXT:    s_mov_b32 s50, 0
 ; GCN-NEXT:    s_mov_b32 s51, 0x40200000
+; GCN-NEXT:    s_mov_b32 s48, 0
 ; GCN-NEXT:    s_mov_b32 s49, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s46, 0
 ; GCN-NEXT:    s_mov_b32 s47, 0x40180000
+; GCN-NEXT:    s_mov_b32 s44, 0
 ; GCN-NEXT:    s_mov_b32 s45, 0x40140000
 ; GCN-NEXT:    s_mov_b64 s[42:43], 4.0
+; GCN-NEXT:    s_mov_b32 s40, 0
 ; GCN-NEXT:    s_mov_b32 s41, 0x40080000
 ; GCN-NEXT:    s_mov_b64 s[38:39], 2.0
 ; GCN-NEXT:    s_movrels_b64 s[0:1], s[36:37]
@@ -2061,31 +2061,31 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
 ; GFX10PLUS-NEXT:    s_mov_b64 s[36:37], 1.0
 ; GFX10PLUS-NEXT:    s_mov_b32 m0, s2
 ; GFX10PLUS-NEXT:    s_mov_b32 s66, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s64, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s62, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s60, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s58, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s56, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s54, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s52, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s50, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s48, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s46, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s44, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s40, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s67, 0x40300000
+; GFX10PLUS-NEXT:    s_mov_b32 s64, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s65, 0x402e0000
+; GFX10PLUS-NEXT:    s_mov_b32 s62, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s63, 0x402c0000
+; GFX10PLUS-NEXT:    s_mov_b32 s60, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s61, 0x402a0000
+; GFX10PLUS-NEXT:    s_mov_b32 s58, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s59, 0x40280000
+; GFX10PLUS-NEXT:    s_mov_b32 s56, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s57, 0x40260000
+; GFX10PLUS-NEXT:    s_mov_b32 s54, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s55, 0x40240000
+; GFX10PLUS-NEXT:    s_mov_b32 s52, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s53, 0x40220000
+; GFX10PLUS-NEXT:    s_mov_b32 s50, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s51, 0x40200000
+; GFX10PLUS-NEXT:    s_mov_b32 s48, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s49, 0x401c0000
+; GFX10PLUS-NEXT:    s_mov_b32 s46, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s47, 0x40180000
+; GFX10PLUS-NEXT:    s_mov_b32 s44, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s45, 0x40140000
 ; GFX10PLUS-NEXT:    s_mov_b64 s[42:43], 4.0
+; GFX10PLUS-NEXT:    s_mov_b32 s40, 0
 ; GFX10PLUS-NEXT:    s_mov_b32 s41, 0x40080000
 ; GFX10PLUS-NEXT:    s_mov_b64 s[38:39], 2.0
 ; GFX10PLUS-NEXT:    s_movrels_b64 s[0:1], s[36:37]
@@ -3176,9 +3176,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
-; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index cfe434b84b4d7..a9876ce5d99bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -2425,8 +2425,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-LABEL: store_load_i64_aligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
@@ -2440,8 +2439,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
@@ -2484,8 +2482,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX11-LABEL: store_load_i64_aligned:
 ; UNALIGNED_GFX11:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; UNALIGNED_GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
@@ -2499,8 +2496,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
-; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
 ; UNALIGNED_GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
@@ -2549,8 +2545,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-LABEL: store_load_i64_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
@@ -2564,8 +2559,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 61b87d19c6b6a..5f18c0cd2aca5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -3320,8 +3320,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_or_b32_e32 v3, s3, v3
 ; CI-NEXT:  .LBB13_16: ; %Flow50
 ; CI-NEXT:    v_cmp_nlg_f64_e64 vcc, s[8:9], 0
-; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_mov_b32_e32 v5, 0x7ff00000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
@@ -3522,8 +3522,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_or_b32_e32 v3, s3, v3
 ; VI-NEXT:  .LBB13_16: ; %Flow50
 ; VI-NEXT:    v_cmp_nlg_f64_e64 vcc, s[8:9], 0
-; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x7ff00000
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index c7b63f749e950..bc50852fb7918 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -81,14 +81,14 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_and_b32 s1, s5, 1
 ; GFX11-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s1
 ; GFX11-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_not1_b32 s0, s0, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %ptr
@@ -123,10 +123,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v2, s0, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v2, s1, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -172,11 +172,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_and_b32 s0, s3, 1
 ; GFX11-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
 ; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX11-NEXT:    s_not_b32 s1, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
@@ -256,11 +255,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_and_b32 s1, s4, 1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v0
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_not1_b32 s0, s0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -333,8 +331,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
@@ -350,9 +348,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_not_b32_e32 v3, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_not_b32_e32 v3, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v3, v2
@@ -437,10 +435,9 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v1, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -515,8 +512,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
-; GFX10-NEXT:    v_not_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
@@ -532,9 +529,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_not_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_not_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, v4, v2
@@ -570,10 +567,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v3, s0, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -615,7 +612,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-LABEL: insertelement_v_v2i16_v_s:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v2
 ; GFX11-NEXT:    s_and_b32 s0, s2, 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
@@ -624,7 +621,6 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_not_b32 s0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
@@ -694,8 +690,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
@@ -710,10 +706,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -866,11 +861,11 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX10-NEXT:    s_not_b32 s2, s3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_and_or_b32 v4, v2, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -889,13 +884,11 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_not_b32 s2, s3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v4, v2, s2, s1
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(1 ) %ptr
@@ -923,8 +916,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -988,6 +981,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
@@ -998,7 +992,6 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -1013,20 +1006,19 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
 ; GFX11-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX11-NEXT:    s_and_b32 s4, s4, 1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX11-NEXT:    s_and_not1_b32 s3, s3, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v4
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(4) %ptr
@@ -1056,8 +1048,8 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1083,8 +1075,8 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1159,10 +1151,9 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v5, v5, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -1193,8 +1184,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1219,8 +1210,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1294,10 +1285,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
 ; GFX11-NEXT:    v_not_b32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v5, v5, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -1321,8 +1311,8 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -1343,8 +1333,8 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -1418,10 +1408,9 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_not_b32_e32 v3, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -1538,9 +1527,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_and_or_b32 v4, v3, s0, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -1563,8 +1551,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_not_b32_e32 v3, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
@@ -1584,8 +1572,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
@@ -1660,9 +1648,8 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -1819,7 +1806,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_lshr_b32 s6, s5, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cselect_b32 s7, s1, s0
@@ -1843,10 +1830,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
-; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(4) %ptr
@@ -1957,6 +1942,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s3
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX10-NEXT:    s_not_b32 s3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
@@ -1964,7 +1950,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
@@ -1986,17 +1971,15 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX11-NEXT:    s_not_b32 s3, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v4, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v6, v4, s3, s2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
@@ -2037,8 +2020,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -2122,6 +2105,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
@@ -2138,7 +2122,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
@@ -2156,6 +2139,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
@@ -2174,9 +2158,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v1, v6
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
@@ -2218,10 +2200,10 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -2255,10 +2237,10 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -2357,10 +2339,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v0, s11, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
@@ -2401,10 +2382,10 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -2437,10 +2418,10 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -2542,9 +2523,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
@@ -2572,8 +2552,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
@@ -2600,8 +2580,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
@@ -2653,6 +2633,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -2667,7 +2648,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
 ; GFX10-NEXT:    v_and_or_b32 v9, v2, v7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
@@ -2681,7 +2661,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX11-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -2696,10 +2677,9 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v9, v2, v7, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v7, 0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
@@ -2876,8 +2856,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -2903,8 +2883,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -2955,6 +2935,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -2964,7 +2945,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_not_b32_e32 v2, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
@@ -2981,7 +2961,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v9, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -2998,8 +2979,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    v_not_b32_e32 v2, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v3, v3, v2, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
@@ -3086,8 +3066,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
 ; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_movreld_b32 s8, s0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
@@ -3171,9 +3151,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
 ; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
+; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0
+; GFX11-NEXT:    v_dual_mov_b32 v10, 16 :: v_dual_mov_b32 v11, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_movrels_b32 s3, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3304,13 +3283,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
 ; GFX10-NEXT:    s_lshl_b32 s2, 0xffff, s0
 ; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX10-NEXT:    s_not_b32 s1, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_movrels_b32_e32 v0, v2
 ; GFX10-NEXT:    v_and_or_b32 v12, v0, s1, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_movreld_b32_e32 v2, v12
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[6:9], off
@@ -3328,14 +3307,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s0
 ; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX11-NEXT:    s_not_b32 s1, s2
-; GFX11-NEXT:    v_mov_b32_e32 v10, 16
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
+; GFX11-NEXT:    v_dual_mov_b32 v10, 16 :: v_dual_mov_b32 v11, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_movrels_b32_e32 v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v12, v0, s1, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_movreld_b32_e32 v2, v12
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
@@ -3474,6 +3452,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GFX10-NEXT:    s_lshl_b32 s1, 0xffff, s0
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3489,7 +3468,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_movreld_b32_e32 v0, v12
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
@@ -3503,7 +3481,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GFX11-NEXT:    s_lshl_b32 s1, 0xffff, s0
-; GFX11-NEXT:    v_mov_b32_e32 v10, 16
+; GFX11-NEXT:    v_dual_mov_b32 v10, 16 :: v_dual_mov_b32 v9, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_movrels_b32 s2, s8
@@ -3515,7 +3493,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX11-NEXT:    v_movreld_b32_e32 v0, v12
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
@@ -3713,6 +3690,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s5, s4, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
@@ -3745,12 +3723,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
@@ -3764,46 +3741,45 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_load_b256 s[8:15], s[2:3], 0x0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
 ; GFX11-NEXT:    s_and_b32 s5, s4, 0xffff
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_and_b32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v8, v0, s5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s11, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s13, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s14, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v10, v1, s15, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v13, v10, v9, v8
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
+; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
+; GFX11-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
@@ -4001,6 +3977,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
@@ -4034,12 +4011,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
@@ -4054,47 +4030,46 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v11, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v1, 0xffff
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, v1, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s13, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v10, v2, s15, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
 ; GFX11-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX11-NEXT:    v_and_or_b32 v13, v10, v9, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, s12
+; GFX11-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v9, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s14
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v1, v13
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
+; GFX11-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
@@ -4129,8 +4104,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 0
@@ -4176,8 +4151,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v13, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v14, 0
@@ -4259,8 +4234,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    s_and_b32 s5, s2, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 16
-; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
@@ -4269,6 +4244,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, 0xffff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
@@ -4284,7 +4260,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
 ; GFX10-NEXT:    v_and_or_b32 v15, v1, v11, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
@@ -4304,17 +4279,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off offset:16
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX11-NEXT:    s_and_b32 s5, s2, 0xffff
-; GFX11-NEXT:    v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v13, 16
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 4, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v11, v2, 0xffff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, s5
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
@@ -4331,11 +4307,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v15, v1, v11, v2
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
-; GFX11-NEXT:    v_mov_b32_e32 v12, 0
-; GFX11-NEXT:    v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v14, 0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s1
@@ -4476,21 +4451,20 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
 ; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off offset:16
-; GFX11-NEXT:    v_dual_mov_b32 v11, 16 :: v_dual_and_b32 v0, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX11-NEXT:    s_and_b32 s0, s2, 1
 ; GFX11-NEXT:    s_lshr_b32 m0, s2, 1
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX11-NEXT:    v_mov_b32_e32 v12, 0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v11, 16 :: v_dual_lshlrev_b32 v0, s0, v0
 ; GFX11-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX11-NEXT:    s_not_b32 s0, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_movrels_b32_e32 v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v2, v1, s0, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_movreld_b32_e32 v3, v2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[3:6], off
@@ -4521,8 +4495,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
@@ -4567,8 +4541,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v14, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v15, 0
@@ -4649,6 +4623,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v14, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -4664,7 +4639,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_not_b32_e32 v3, v12
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
@@ -4693,9 +4667,10 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
 ; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX11-NEXT:    v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3
-; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v14, 16
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
@@ -4705,14 +4680,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_lshlrev_b32 v2, v3, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v12, v3, 0xffff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_not_b32_e32 v3, v12
-; GFX11-NEXT:    v_mov_b32_e32 v12, 0
-; GFX11-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v12, 0 :: v_dual_cndmask_b32 v1, v4, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 7c212f1e110d1..56401c25f55ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -99,10 +99,9 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -198,12 +197,12 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
@@ -305,9 +304,8 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -389,9 +387,9 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v1, s4, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
@@ -409,12 +407,12 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, s4, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -493,10 +491,10 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
@@ -514,10 +512,9 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -614,12 +611,12 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -714,11 +711,10 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -813,11 +809,10 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -910,10 +905,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v2, s0, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v2, s1, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -959,11 +954,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_and_b32 s0, s3, 3
 ; GFX11-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s0
 ; GFX11-NEXT:    s_lshl_b32 s0, s1, s0
 ; GFX11-NEXT:    s_not_b32 s1, s2
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1043,11 +1037,10 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_and_b32 s1, s4, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xff, v0
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_not1_b32 s0, s0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1120,8 +1113,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
@@ -1137,9 +1130,9 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_not_b32_e32 v3, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_not_b32_e32 v3, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v3, v2
@@ -1224,10 +1217,9 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v1, 0xff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v1, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1302,8 +1294,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
-; GFX10-NEXT:    v_not_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v4, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
@@ -1319,9 +1311,9 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_not_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_not_b32_e32 v4, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, v4, v2
@@ -1357,10 +1349,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v3, s0, 0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1402,7 +1394,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-LABEL: insertelement_v_v4i8_v_s:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xff, v2
 ; GFX11-NEXT:    s_and_b32 s0, s2, 3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
@@ -1411,7 +1403,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_lshl_b32 s0, 0xff, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_not_b32 s0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
@@ -1481,8 +1472,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, 0xff
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_not_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
@@ -1497,10 +1488,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1619,7 +1609,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    s_lshr_b32 s2, s5, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cselect_b32 s3, s1, s0
@@ -1636,8 +1626,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX11-NEXT:    s_cselect_b32 s1, s3, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
@@ -1729,11 +1718,11 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, s1
 ; GFX10-NEXT:    s_not_b32 s2, s3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_and_or_b32 v4, v2, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1752,13 +1741,11 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_not_b32 s2, s3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s0, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v4, v2, s2, s1
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(1 ) %ptr
@@ -1786,8 +1773,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1851,6 +1838,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 3
@@ -1861,7 +1849,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -1876,20 +1863,19 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_cmp_eq_u32 s2, 1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
 ; GFX11-NEXT:    s_cselect_b32 s3, s1, s0
 ; GFX11-NEXT:    s_and_b32 s4, s4, 3
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    s_lshl_b32 s5, 0xff, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s5, 0xff, s4
 ; GFX11-NEXT:    s_and_not1_b32 s3, s3, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v4
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
@@ -1919,8 +1905,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -1946,8 +1932,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2022,10 +2008,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v5, v5, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -2056,8 +2041,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
@@ -2082,8 +2067,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2157,10 +2142,9 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
 ; GFX11-NEXT:    v_not_b32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v5, v5, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -2184,8 +2168,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -2206,8 +2190,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
@@ -2281,10 +2265,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_not_b32_e32 v3, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -2401,9 +2384,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_and_or_b32 v4, v3, s0, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -2426,8 +2408,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_not_b32_e32 v3, v3
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
@@ -2447,8 +2429,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
@@ -2523,9 +2505,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_and_or_b32 v4, v4, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -2682,7 +2663,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_lshr_b32 s6, s5, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cselect_b32 s7, s1, s0
@@ -2706,10 +2687,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_cselect_b32 s2, s4, s2
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-NEXT:    s_cselect_b32 s3, s4, s3
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
-; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(4) %ptr
@@ -2820,6 +2799,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xff, s3
 ; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX10-NEXT:    s_not_b32 s3, s5
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
@@ -2827,7 +2807,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, s2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
@@ -2849,17 +2828,15 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX11-NEXT:    s_not_b32 s3, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v4, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v6, v4, s3, s2
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
@@ -2900,8 +2877,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -2985,6 +2962,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
@@ -3001,7 +2979,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
@@ -3019,6 +2996,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cselect_b32 s6, s1, s0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
@@ -3037,9 +3015,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v1, v6
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
@@ -3081,10 +3057,10 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -3118,10 +3094,10 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3220,10 +3196,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v0, s11, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
@@ -3264,10 +3239,10 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
@@ -3300,10 +3275,10 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -3405,9 +3380,8 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX11-NEXT:    v_and_or_b32 v7, v7, v5, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
@@ -3435,8 +3409,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
@@ -3463,8 +3437,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
@@ -3516,6 +3490,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v2
 ; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -3530,7 +3505,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
 ; GFX10-NEXT:    v_and_or_b32 v9, v2, v7, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
@@ -3544,7 +3518,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 2, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 3, v2
 ; GFX11-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -3559,10 +3534,9 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_or_b32 v9, v2, v7, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v7, 0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
@@ -3739,8 +3713,8 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -3766,8 +3740,8 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
@@ -3818,6 +3792,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 2, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 3, v3
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -3827,7 +3802,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_not_b32_e32 v2, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
@@ -3844,7 +3818,8 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v9, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 3, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
@@ -3861,8 +3836,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    v_not_b32_e32 v2, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v3, v3, v2, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 6738c5e224267..d4f7fc2bc6bb0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -686,17 +686,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b32 s18, 0
-; GPRIDX-NEXT:    s_mov_b32 s16, 0
-; GPRIDX-NEXT:    s_mov_b32 s14, 0
-; GPRIDX-NEXT:    s_mov_b32 s12, 0
-; GPRIDX-NEXT:    s_mov_b32 s8, 0
 ; GPRIDX-NEXT:    s_mov_b64 s[4:5], 1.0
+; GPRIDX-NEXT:    s_mov_b32 s18, 0
 ; GPRIDX-NEXT:    s_mov_b32 s19, 0x40200000
+; GPRIDX-NEXT:    s_mov_b32 s16, 0
 ; GPRIDX-NEXT:    s_mov_b32 s17, 0x401c0000
+; GPRIDX-NEXT:    s_mov_b32 s14, 0
 ; GPRIDX-NEXT:    s_mov_b32 s15, 0x40180000
+; GPRIDX-NEXT:    s_mov_b32 s12, 0
 ; GPRIDX-NEXT:    s_mov_b32 s13, 0x40140000
 ; GPRIDX-NEXT:    s_mov_b64 s[10:11], 4.0
+; GPRIDX-NEXT:    s_mov_b32 s8, 0
 ; GPRIDX-NEXT:    s_mov_b32 s9, 0x40080000
 ; GPRIDX-NEXT:    s_mov_b64 s[6:7], 2.0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v3, s4
@@ -754,15 +754,15 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1.0
 ; GFX10-NEXT:    s_mov_b32 s18, 0
-; GFX10-NEXT:    s_mov_b32 s16, 0
-; GFX10-NEXT:    s_mov_b32 s14, 0
-; GFX10-NEXT:    s_mov_b32 s12, 0
-; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_mov_b32 s19, 0x40200000
+; GFX10-NEXT:    s_mov_b32 s16, 0
 ; GFX10-NEXT:    s_mov_b32 s17, 0x401c0000
+; GFX10-NEXT:    s_mov_b32 s14, 0
 ; GFX10-NEXT:    s_mov_b32 s15, 0x40180000
+; GFX10-NEXT:    s_mov_b32 s12, 0
 ; GFX10-NEXT:    s_mov_b32 s13, 0x40140000
 ; GFX10-NEXT:    s_mov_b64 s[10:11], 4.0
+; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_mov_b32 s9, 0x40080000
 ; GFX10-NEXT:    s_mov_b64 s[6:7], 2.0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
@@ -818,17 +818,17 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX11-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s14, 0
 ; GFX11-NEXT:    s_mov_b32 s15, 0x40200000
-; GFX11-NEXT:    s_mov_b32 s12, 0
-; GFX11-NEXT:    s_mov_b32 s10, 0
-; GFX11-NEXT:    s_mov_b32 s8, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 1.0
+; GFX11-NEXT:    s_mov_b32 s14, 0
+; GFX11-NEXT:    s_mov_b32 s12, 0
 ; GFX11-NEXT:    s_mov_b32 s13, 0x401c0000
+; GFX11-NEXT:    s_mov_b32 s10, 0
 ; GFX11-NEXT:    s_mov_b32 s11, 0x40180000
+; GFX11-NEXT:    s_mov_b32 s8, 0
 ; GFX11-NEXT:    s_mov_b32 s9, 0x40140000
 ; GFX11-NEXT:    s_mov_b64 s[6:7], 4.0
+; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b32 s5, 0x40080000
 ; GFX11-NEXT:    s_mov_b64 s[2:3], 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 9bbdc2982138c..df24d78db764d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -108,12 +108,10 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 ; GFX1170:       ; %bb.0:
 ; GFX1170-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1170-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1170-NEXT:    v_min_num_f64 v[0:1], v[0:1], v[2:3]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX1170-NEXT:    v_max_num_f64 v[0:1], v[0:1], v[2:3]
 ; GFX1170-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -125,12 +123,10 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
@@ -160,12 +156,10 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ; GFX1170:       ; %bb.0:
 ; GFX1170-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1170-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1170-NEXT:    v_min_num_f64 v[0:1], v[0:1], v[2:3]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX1170-NEXT:    v_max_num_f64 v[0:1], v[0:1], v[2:3]
 ; GFX1170-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -177,12 +171,10 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call double @llvm.fabs.f64(double %src)
@@ -254,12 +246,10 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ; GFX1170:       ; %bb.0:
 ; GFX1170-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1170-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1170-NEXT:    v_min_num_f64 v[0:1], v[0:1], v[2:3]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX1170-NEXT:    v_max_num_f64 v[0:1], v[0:1], v[2:3]
 ; GFX1170-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -271,12 +261,10 @@ define double @v_rsq_clamp_undef_f64() #0 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double poison)
@@ -346,12 +334,10 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ; GFX1170:       ; %bb.0:
 ; GFX1170-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1170-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX1170-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1170-NEXT:    v_min_num_f64 v[0:1], v[0:1], v[2:3]
-; GFX1170-NEXT:    v_mov_b32_e32 v2, -1
-; GFX1170-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX1170-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX1170-NEXT:    v_max_num_f64 v[0:1], v[0:1], v[2:3]
 ; GFX1170-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -363,12 +349,10 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x7fefffff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0xffefffff
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 9ccfde8d4c37c..3571bf191fcd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -70,11 +70,11 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
 ; GCN-LABEL: set_inactive_imm_poison_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 6b13bf675e036..fc4ff6f835963 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1729,18 +1729,18 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX11-LABEL: v_lshr_i65:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
-; GFX11-NEXT:    v_mov_b32_e32 v4, 1
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
 ; GFX11-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v7, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
@@ -1755,8 +1755,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX6-LABEL: v_lshr_i65_33:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 1
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
@@ -1768,8 +1768,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX8-LABEL: v_lshr_i65_33:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
@@ -1781,8 +1781,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX9-LABEL: v_lshr_i65_33:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
@@ -1794,8 +1794,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX10-LABEL: v_lshr_i65_33:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 5c80c27c3d280..6194b197b3850 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -72,11 +72,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -84,11 +84,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 ;
 ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -110,11 +110,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -122,11 +122,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in
 ;
 ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -667,10 +667,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -679,10 +679,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -706,10 +706,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -718,10 +718,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) in
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -1242,11 +1242,11 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
 ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
@@ -1257,11 +1257,11 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ;
 ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
@@ -1469,11 +1469,11 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr, i32 %old, i32 %in) {
 ; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    s_mov_b32 s0, s2
 ; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
@@ -1485,11 +1485,11 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ;
 ; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    s_mov_b32 s0, s2
 ; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
@@ -1560,8 +1560,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1) %ptr, i32 %old, i32 %in) {
 ; GFX6-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    s_mov_b32 s0, 0
 ; GFX6-NEXT:    s_mov_b32 s1, 4
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -1574,8 +1574,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ;
 ; GFX7-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    s_mov_b32 s0, 0
 ; GFX7-NEXT:    s_mov_b32 s1, 4
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1ad3aa063ef5a..238bd9717c7b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -354,8 +354,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr
 ; GFX11-LABEL: v_mul_i64_masked_src0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -443,8 +443,8 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p
 ; GFX11-LABEL: v_mul64_masked_before_branch:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index 3bf687523727d..3fe174b96979f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -657,10 +657,10 @@ define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1)
 ; GFX6-LABEL: s_orn2_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -710,10 +710,10 @@ define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inre
 ; GFX6-LABEL: s_orn2_v3i16_commute:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
@@ -763,10 +763,10 @@ define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 ; GFX6-LABEL: s_orn2_v3i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 16
-; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s4, s4, s7
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..2176761b94516 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -271,9 +271,9 @@ define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_add_i32 s4, s4, s9
-; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
 ; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
 ; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index bb7beb8d0b9e2..4c10e4d459849 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -521,6 +521,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v0
@@ -541,11 +542,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v19, s2
 ; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
 ; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index dfd56b0917486..f5fdee766c9c5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -3049,9 +3049,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v8, vcc, s2, v10
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
@@ -3134,9 +3134,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
@@ -3325,14 +3325,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s2, v10
-; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
@@ -3441,12 +3440,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s3, v7
 ; GFX1264_DPP-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1264_DPP-NEXT:    v_add_co_u32 v6, vcc, s2, v8
-; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e64 v7, null, s3, v9, vcc
 ; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], null
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
@@ -3526,15 +3525,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1232_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s3, v9
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s2, v10
-; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
 ; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
@@ -7552,9 +7550,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v7
 ; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v9
 ; GFX1064_DPP-NEXT:    v_sub_co_u32 v8, vcc, s2, v10
-; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
 ;
@@ -7655,9 +7653,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v8
 ; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
 ; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s2, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
 ;
@@ -7889,18 +7887,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:  ; %bb.3: ; %Flow
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s11
 ; GFX1132_DPP-NEXT:  .LBB11_4: ; %Flow3
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s10
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s2, v10
-; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
@@ -8009,12 +8006,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s3, v7
 ; GFX1264_DPP-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1264_DPP-NEXT:    v_sub_co_u32 v6, vcc, s2, v8
-; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1264_DPP-NEXT:    v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc
 ; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    buffer_store_b64 v[6:7], off, s[0:3], null
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
@@ -8094,15 +8091,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1232_DPP-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1232_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1232_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1232_DPP-NEXT:    v_readfirstlane_b32 s3, v9
-; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s2, v10
-; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1232_DPP-NEXT:    v_sub_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
 ; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 95908b2b666cf..be6de57674985 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2541,8 +2541,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
 ; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s3, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s4, v12, vcc
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -2620,8 +2620,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
 ; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s3, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -2800,13 +2800,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s3, v10
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e64 v9, null, s4, v11, vcc_lo
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -3268,8 +3267,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
-; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -3330,12 +3329,12 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0) depctr_va_vcc(0)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e64 v3, null, v1, v4, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1164_DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v7
 ; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1164_DPP-NEXT:  ; %bb.1:
@@ -3381,11 +3380,11 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_DPP-NEXT:    v_permlanex16_b32 v4, v1, 0, 0
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e64 v3, null, v1, v4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1132_DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v7
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1132_DPP-NEXT:  ; %bb.1:
@@ -5954,8 +5953,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    s_mov_b32 null, 0
 ; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1064_DPP-NEXT:    v_sub_co_u32 v9, vcc, s3, v11
-; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc, s4, v12, vcc
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -6033,8 +6032,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_mov_b32 null, 0
 ; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1032_DPP-NEXT:    v_sub_co_u32 v9, vcc_lo, s3, v11
-; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -6213,13 +6212,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v6
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v7
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v7
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_sub_co_u32 v8, vcc_lo, s3, v10
-; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e64 v9, null, s4, v11, vcc_lo
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -7562,8 +7560,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_and_b32_e32 v9, s3, v9
@@ -8910,8 +8907,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_or_b32_e32 v9, s3, v9
@@ -10258,8 +10254,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v8
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132_DPP-NEXT:    v_xor_b32_e32 v9, s3, v9
@@ -11083,8 +11078,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
@@ -12082,8 +12077,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
@@ -12907,8 +12901,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132-NEXT:    s_cbranch_execz .LBB25_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
@@ -13906,8 +13900,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
@@ -14727,8 +14720,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
@@ -15715,8 +15708,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -16537,8 +16529,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1132-NEXT:    s_cbranch_execz .LBB31_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 5
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
@@ -17523,8 +17515,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s5, v8
 ; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v4
-; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index d08185a9e0ccd..5286ee30cdbf5 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -62,8 +62,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_0
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
 
@@ -73,8 +73,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg1
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO -1, implicit $exec
 ...
 
@@ -84,8 +84,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
 ...
 
@@ -95,8 +95,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
     ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -112,12 +112,12 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
-    ; GFX90A: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0
+    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
     ;
     ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
     ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 64, implicit $exec
@@ -130,8 +130,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_0_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906944, implicit $exec
 ...
 
@@ -141,8 +141,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_2_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906946, implicit $exec
 ...
 
@@ -152,8 +152,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg16_hi_9_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr0_agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr1
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -163,24 +163,24 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
-    ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
-    ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4_vgpr5
-    ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4_vgpr5
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
+    ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
+    ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4
+    ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr5
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
-    ; GFX90A: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
-    ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0
+    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
+    ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
     ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
     ;
     ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
     ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 1042479491, implicit $exec
-    ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
-    ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
+    ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
+    ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
     ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0_vgpr1 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
     $vgpr2_vgpr3 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
@@ -193,8 +193,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_unaligned_agpr
-    ; CHECK: $agpr1 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr1_agpr2
-    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr1_agpr2
+    ; CHECK: $agpr1 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr1
+    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr2
     $agpr1_agpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -204,8 +204,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_unaligned_vgpr
-    ; CHECK: $vgpr1 = V_MOV_B32_e32 9, implicit $exec, implicit-def $vgpr1_vgpr2
-    ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
+    ; CHECK: $vgpr1 = V_MOV_B32_e32 9, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr2
     $vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -214,8 +214,8 @@ name: av_mov_b64_misalign_vgpr
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
-    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
-    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5
+    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr6
     $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
 
@@ -224,7 +224,7 @@ name: av_mov_b64_misalign_agpr
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
-    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
-    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5
+    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr6
     $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 5baa5257570b3..b7ee91a8ad696 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1487,7 +1487,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v2i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x80008
 ; GFX11-NEXT:    s_add_i32 s0, s0, s0
@@ -1496,7 +1496,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1579,7 +1579,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v4i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 24
@@ -1598,7 +1598,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1664,15 +1664,15 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; VI-NEXT:    v_mov_b32_e32 v0, 2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    s_bfe_u32 s2, s0, 0x80008
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    s_add_i32 s0, s0, s0
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_lshl_b32 s2, s2, 8
-; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1683,7 +1683,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v3i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
@@ -1693,9 +1694,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX11-NEXT:    s_add_i32 s1, s1, s1
 ; GFX11-NEXT:    s_or_b32 s0, s0, s2
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
+; GFX11-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
@@ -1780,11 +1780,11 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s3, s4, 8
 ; VI-NEXT:    s_or_b32 s0, s0, s3
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_lshl_b32 s2, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1795,7 +1795,8 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v5i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
@@ -1814,9 +1815,8 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX11-NEXT:    s_add_i32 s1, s1, s1
 ; GFX11-NEXT:    s_or_b32 s0, s0, s2
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
+; GFX11-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
@@ -1949,6 +1949,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v8i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
@@ -1982,9 +1983,8 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 16
 ; GFX11-NEXT:    s_or_b32 s0, s0, s2
 ; GFX11-NEXT:    s_or_b32 s1, s1, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2195,6 +2195,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v16i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s7, s1, 24
@@ -2260,9 +2261,10 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11-NEXT:    s_or_b32 s2, s2, s8
 ; GFX11-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-NEXT:    s_or_b32 s1, s1, s5
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2511,6 +2513,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_lshr_b32 s13, s6, 24
 ; VI-NEXT:    s_lshr_b32 s14, s7, 16
 ; VI-NEXT:    s_lshr_b32 s15, s7, 24
+; VI-NEXT:    s_lshr_b32 s16, s0, 16
+; VI-NEXT:    s_lshr_b32 s17, s0, 24
+; VI-NEXT:    s_lshr_b32 s18, s1, 16
+; VI-NEXT:    s_lshr_b32 s19, s1, 24
+; VI-NEXT:    s_lshr_b32 s20, s2, 16
+; VI-NEXT:    s_lshr_b32 s21, s2, 24
+; VI-NEXT:    s_lshr_b32 s22, s3, 16
+; VI-NEXT:    s_lshr_b32 s23, s3, 24
 ; VI-NEXT:    s_bfe_u32 s24, s4, 0x80008
 ; VI-NEXT:    s_bfe_u32 s25, s5, 0x80008
 ; VI-NEXT:    s_bfe_u32 s26, s6, 0x80008
@@ -2523,14 +2533,18 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_add_i32 s10, s10, s10
 ; VI-NEXT:    s_add_i32 s9, s9, s9
 ; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    s_lshr_b32 s16, s0, 16
-; VI-NEXT:    s_lshr_b32 s17, s0, 24
-; VI-NEXT:    s_lshr_b32 s18, s1, 16
-; VI-NEXT:    s_lshr_b32 s19, s1, 24
-; VI-NEXT:    s_lshr_b32 s20, s2, 16
-; VI-NEXT:    s_lshr_b32 s21, s2, 24
-; VI-NEXT:    s_lshr_b32 s22, s3, 16
-; VI-NEXT:    s_lshr_b32 s23, s3, 24
+; VI-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s30, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s31, s3, 0x80008
+; VI-NEXT:    s_add_i32 s23, s23, s23
+; VI-NEXT:    s_add_i32 s22, s22, s22
+; VI-NEXT:    s_add_i32 s21, s21, s21
+; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_add_i32 s19, s19, s19
+; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_add_i32 s17, s17, s17
+; VI-NEXT:    s_add_i32 s16, s16, s16
 ; VI-NEXT:    s_lshl_b32 s15, s15, 8
 ; VI-NEXT:    s_and_b32 s14, s14, 0xff
 ; VI-NEXT:    s_add_i32 s7, s7, s7
@@ -2547,30 +2561,6 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_and_b32 s8, s8, 0xff
 ; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s24, s24, s24
-; VI-NEXT:    s_bfe_u32 s28, s0, 0x80008
-; VI-NEXT:    s_bfe_u32 s29, s1, 0x80008
-; VI-NEXT:    s_bfe_u32 s30, s2, 0x80008
-; VI-NEXT:    s_bfe_u32 s31, s3, 0x80008
-; VI-NEXT:    s_add_i32 s23, s23, s23
-; VI-NEXT:    s_add_i32 s22, s22, s22
-; VI-NEXT:    s_add_i32 s21, s21, s21
-; VI-NEXT:    s_add_i32 s20, s20, s20
-; VI-NEXT:    s_add_i32 s19, s19, s19
-; VI-NEXT:    s_add_i32 s18, s18, s18
-; VI-NEXT:    s_add_i32 s17, s17, s17
-; VI-NEXT:    s_add_i32 s16, s16, s16
-; VI-NEXT:    s_or_b32 s14, s14, s15
-; VI-NEXT:    s_and_b32 s7, s7, 0xff
-; VI-NEXT:    s_lshl_b32 s15, s27, 8
-; VI-NEXT:    s_or_b32 s12, s12, s13
-; VI-NEXT:    s_and_b32 s6, s6, 0xff
-; VI-NEXT:    s_lshl_b32 s13, s26, 8
-; VI-NEXT:    s_or_b32 s10, s10, s11
-; VI-NEXT:    s_and_b32 s5, s5, 0xff
-; VI-NEXT:    s_lshl_b32 s11, s25, 8
-; VI-NEXT:    s_or_b32 s8, s8, s9
-; VI-NEXT:    s_and_b32 s4, s4, 0xff
-; VI-NEXT:    s_lshl_b32 s9, s24, 8
 ; VI-NEXT:    s_lshl_b32 s23, s23, 8
 ; VI-NEXT:    s_and_b32 s22, s22, 0xff
 ; VI-NEXT:    s_add_i32 s3, s3, s3
@@ -2587,10 +2577,18 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_and_b32 s16, s16, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
 ; VI-NEXT:    s_add_i32 s28, s28, s28
-; VI-NEXT:    s_or_b32 s7, s7, s15
-; VI-NEXT:    s_or_b32 s6, s6, s13
-; VI-NEXT:    s_or_b32 s5, s5, s11
-; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_or_b32 s14, s14, s15
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_lshl_b32 s15, s27, 8
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_lshl_b32 s13, s26, 8
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s5, s5, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s25, 8
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s24, 8
 ; VI-NEXT:    s_or_b32 s22, s22, s23
 ; VI-NEXT:    s_and_b32 s3, s3, 0xff
 ; VI-NEXT:    s_lshl_b32 s23, s31, 8
@@ -2603,6 +2601,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_or_b32 s16, s16, s17
 ; VI-NEXT:    s_and_b32 s0, s0, 0xff
 ; VI-NEXT:    s_lshl_b32 s17, s28, 8
+; VI-NEXT:    s_or_b32 s7, s7, s15
+; VI-NEXT:    s_or_b32 s6, s6, s13
+; VI-NEXT:    s_or_b32 s5, s5, s11
+; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_or_b32 s3, s3, s23
+; VI-NEXT:    s_or_b32 s2, s2, s21
+; VI-NEXT:    s_or_b32 s1, s1, s19
+; VI-NEXT:    s_or_b32 s0, s0, s17
 ; VI-NEXT:    s_lshl_b32 s14, s14, 16
 ; VI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; VI-NEXT:    s_lshl_b32 s12, s12, 16
@@ -2611,14 +2617,6 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; VI-NEXT:    s_lshl_b32 s8, s8, 16
 ; VI-NEXT:    s_and_b32 s4, s4, 0xffff
-; VI-NEXT:    s_or_b32 s3, s3, s23
-; VI-NEXT:    s_or_b32 s2, s2, s21
-; VI-NEXT:    s_or_b32 s1, s1, s19
-; VI-NEXT:    s_or_b32 s0, s0, s17
-; VI-NEXT:    s_or_b32 s7, s7, s14
-; VI-NEXT:    s_or_b32 s6, s6, s12
-; VI-NEXT:    s_or_b32 s5, s5, s10
-; VI-NEXT:    s_or_b32 s4, s4, s8
 ; VI-NEXT:    s_lshl_b32 s22, s22, 16
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; VI-NEXT:    s_lshl_b32 s20, s20, 16
@@ -2627,14 +2625,18 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
 ; VI-NEXT:    s_lshl_b32 s16, s16, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_or_b32 s7, s7, s14
+; VI-NEXT:    s_or_b32 s6, s6, s12
+; VI-NEXT:    s_or_b32 s5, s5, s10
+; VI-NEXT:    s_or_b32 s4, s4, s8
 ; VI-NEXT:    s_or_b32 s3, s3, s22
 ; VI-NEXT:    s_or_b32 s2, s2, s20
 ; VI-NEXT:    s_or_b32 s1, s1, s18
 ; VI-NEXT:    s_or_b32 s0, s0, s16
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2648,6 +2650,8 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; GFX11-LABEL: amd_kernel_v32i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v9, 0
+; GFX11-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshr_b32 s16, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s17, s0, 24
@@ -2777,12 +2781,11 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; GFX11-NEXT:    s_or_b32 s5, s5, s9
 ; GFX11-NEXT:    s_or_b32 s2, s2, s20
 ; GFX11-NEXT:    s_or_b32 s0, s0, s16
-; GFX11-NEXT:    v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v5, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v11, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index c840c6e6b3705..76f84756dc3c1 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -318,9 +318,9 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    global_load_short_d16_hi v0, v[1:2], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -328,11 +328,10 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
 ; GFX11-TRUE16:       ; %bb.0: ; %bb
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -340,11 +339,10 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
 ; GFX11-FAKE16:       ; %bb.0: ; %bb
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
@@ -436,9 +434,9 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
 ; GFX10_DEFAULT-NEXT:    s_addc_u32 s5, s17, 0
 ; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10_DEFAULT-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    flat_load_short_d16_hi v0, v[1:2]
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -451,9 +449,9 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
 ; FLATSCR_GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR_GFX10-NEXT:    flat_load_ushort v0, v[0:1]
 ; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    flat_load_short_d16_hi v0, v[1:2]
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -463,9 +461,9 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
 ; GFX11-TRUE16:       ; %bb.0: ; %bb
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:2
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -475,9 +473,9 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
 ; GFX11-FAKE16:       ; %bb.0: ; %bb
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-FAKE16-NEXT:    flat_load_u16 v0, v[0:1] offset:2
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index ecafe94d4cd55..0c01ffe4df208 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -473,9 +473,8 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmpk_lt_i32 s0, 0x3e9
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -493,9 +492,8 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e8
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -513,9 +511,8 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmp_lt_u32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -533,9 +530,8 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmp_ge_i32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -553,9 +549,8 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmpk_lt_u32 s0, 0x3e9
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -573,9 +568,8 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e7
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -593,9 +587,8 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmp_le_i32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
@@ -613,9 +606,8 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 56de9dde7c310..db6d3fc205387 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -494,8 +494,8 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-LABEL: fmin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
@@ -520,8 +520,8 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-LABEL: fmax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 6a5b3bc42555b..581f7a2b83ceb 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -112,10 +112,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v7, v[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -141,11 +141,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, -1, v22
 ; GFX9-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v21, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v28, vcc, -1, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -1231,8 +1231,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[6:7]
 ; GFX9-G-NEXT:    v_sub_co_u32_e64 v0, s[6:7], v0, v1
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT:    v_mov_b32_e32 v6, 0x7f
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT:    v_mov_b32_e32 v6, 0x7f
 ; GFX9-G-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
 ; GFX9-G-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
@@ -2350,10 +2350,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v15, v[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v15
-; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, v11, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -2377,13 +2377,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, -1, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v12, v0, s[4:5]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v12, v0, s[4:5]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:  .LBB1_3: ; %udiv-do-while
@@ -3333,8 +3333,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v9, v10, v9, s[6:7]
 ; GFX9-G-NEXT:    v_sub_co_u32_e64 v12, s[6:7], v8, v9
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v13, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT:    v_mov_b32_e32 v8, 0x7f
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT:    v_mov_b32_e32 v8, 0x7f
 ; GFX9-G-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7]
 ; GFX9-G-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 0b099cd5ac1a4..af9dbc5052dc2 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -95,9 +95,9 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
@@ -108,13 +108,12 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    ds_store_b32 v2, v3 offset:12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
@@ -157,12 +156,12 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX9-NEXT:    s_mov_b64 vcc, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    ds_write_b32 v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v0, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    ds_write_b32 v4, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -171,9 +170,9 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_write_b32 v3, v2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -186,9 +185,8 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    ds_store_b32 v3, v2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
@@ -588,8 +586,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
@@ -602,10 +600,10 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
 ; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -617,14 +615,13 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    ds_store_2addr_b32 v2, v3, v4 offset1:1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 71af21a11c2ce..b4768bd039668 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -167,13 +167,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:  .LBB1_2: ; %bb1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 10
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index fac9f5bf826a6..033ab8dc0a0ae 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -220,9 +220,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -317,9 +317,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; VI-NEXT:    s_add_i32 s12, s12, s17
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 53b6ceb45531e..6b7ddd9a49087 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2511,9 +2511,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -2553,8 +2553,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -2572,8 +2572,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -2583,33 +2583,19 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 }
 
 define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 {
-; GFX678-SDAG-LABEL: test_fold_canonicalize_n0_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_fold_canonicalize_n0_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_fold_canonicalize_n0_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -2673,33 +2659,19 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 }
 
 define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 {
-; GFX678-SDAG-LABEL: test_fold_canonicalize_p1_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_fold_canonicalize_p1_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_fold_canonicalize_p1_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -2731,8 +2703,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x3ff00000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -2748,8 +2720,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x3ff00000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -2759,33 +2731,19 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 }
 
 define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 {
-; GFX678-SDAG-LABEL: test_fold_canonicalize_n1_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0xbff00000
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_fold_canonicalize_n1_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0xbff00000
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_fold_canonicalize_n1_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -2817,8 +2775,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0xbff00000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -2834,8 +2792,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0xbff00000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -2845,33 +2803,19 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 }
 
 define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 {
-; GFX678-SDAG-LABEL: test_fold_canonicalize_literal_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40300000
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_fold_canonicalize_literal_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40300000
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_fold_canonicalize_literal_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -2903,8 +2847,8 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x40300000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -2920,8 +2864,8 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x40300000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -2949,9 +2893,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -2991,8 +2935,8 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX11-GISEL-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3010,8 +2954,8 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX12-GISEL-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3021,33 +2965,19 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 }
 
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 {
-; GFX678-SDAG-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0xfffff
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0xfffff
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -3081,8 +3011,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX11-GISEL-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0xfffff :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0xfffff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3099,8 +3029,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX12-GISEL-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0xfffff :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0xfffff
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3110,33 +3040,19 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 }
 
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 {
-; GFX678-SDAG-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -3200,33 +3116,19 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 }
 
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 {
-; GFX678-SDAG-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0x800fffff
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800fffff
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -3260,8 +3162,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX11-GISEL-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x800fffff :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x800fffff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3278,8 +3180,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX12-GISEL-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x800fffff :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x800fffff
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3289,33 +3191,19 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 }
 
 define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 {
-; GFX678-SDAG-LABEL: test_fold_canonicalize_qnan_f64:
-; GFX678-SDAG:       ; %bb.0:
-; GFX678-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-SDAG-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX678-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-SDAG-NEXT:    s_endpgm
-;
-; GFX678-GISEL-LABEL: test_fold_canonicalize_qnan_f64:
-; GFX678-GISEL:       ; %bb.0:
-; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; GFX678-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX678-GISEL-NEXT:    s_endpgm
+; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT:    s_add_i32 s12, s12, s17
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX678-NEXT:    v_mov_b32_e32 v2, s0
+; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX678-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX9-SDAG:       ; %bb.0:
@@ -3347,8 +3235,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x7ff80000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3364,8 +3252,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x7ff80000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3393,9 +3281,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, -1
@@ -3433,8 +3321,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3450,8 +3338,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3479,9 +3367,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -2
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -2
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, -1
@@ -3519,8 +3407,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, -2
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, -2 :: v_dual_mov_b32 v1, -1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3536,8 +3424,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, -2
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, -2 :: v_dual_mov_b32 v1, -1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -3621,10 +3509,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ;
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x7ff00000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0x7ff00000
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3640,10 +3528,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ;
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0x7ff00000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0x7ff00000
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3672,9 +3560,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_bfrev_b32_e32 v1, -2
@@ -3816,10 +3704,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ;
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0xfff00000 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0xfff00000
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3835,10 +3723,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ;
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0xfff00000 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0xfff00000
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3867,9 +3755,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX678-GISEL:       ; %bb.0:
 ; GFX678-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX678-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX678-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-GISEL-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX678-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX678-GISEL-NEXT:    v_mov_b32_e32 v1, -1
@@ -3907,8 +3795,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX11-GISEL-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -3924,8 +3812,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX12-GISEL-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 62f16fe2760ef..edd5d214a114e 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -102,8 +102,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <
 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_ret:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -113,8 +112,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -157,8 +155,7 @@ define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x
 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16_ret:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -168,8 +165,7 @@ define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 2fe2978f81bbd..8f03abd1124f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1479,8 +1479,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX90A-NEXT:    buffer_wbl2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1525,8 +1525,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
@@ -1569,8 +1569,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX90A-NEXT:    buffer_wbl2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1615,8 +1615,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index dbe34d19cf3eb..dbe956d7a86fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -10,11 +10,11 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_bfe_u32 v6, v5, 20, 11
 ; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
-; SDAG-NEXT:    v_mov_b32_e32 v4, v0
 ; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_6
@@ -97,8 +97,8 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, v1
 ; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 52, v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 11
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0
@@ -241,8 +241,8 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
 ; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
 ; SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB1_6
@@ -294,8 +294,8 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, v1
 ; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 52, v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 11
 ; GISEL-NEXT:    s_mov_b64 s[4:5], 0
@@ -359,8 +359,8 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_bfe_u32 v9, v4, 23, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v9
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -575,8 +575,8 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_bfe_u32 v6, v4, 23, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v6
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -613,8 +613,8 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
 ; SDAG-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; SDAG-NEXT:    v_sub_u32_e32 v0, 0x96, v6
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:  ; %bb.5: ; %Flow1
@@ -670,8 +670,8 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
 ; GISEL-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; GISEL-NEXT:    v_sub_u32_e32 v0, 0x96, v6
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:  ; %bb.5: ; %Flow1
@@ -729,8 +729,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_bfe_u32 v8, v4, 7, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v8
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -942,8 +942,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    v_bfe_u32 v5, v4, 7, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v5
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
@@ -982,8 +982,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
 ; SDAG-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; SDAG-NEXT:    v_sub_u16_e32 v0, 0x86, v5
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_lshrrev_b16_e32 v0, v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:  ; %bb.5: ; %Flow1
@@ -1041,8 +1041,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:  ; %bb.4: ; %fp-to-i-if-exp.small
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x86
 ; GISEL-NEXT:    v_sub_u16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_lshrrev_b16_e32 v0, v0, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:  ; %bb.5: ; %Flow1
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
index 55b457247bb8f..3446c006a5cc1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
@@ -1073,12 +1073,12 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s8, 0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s9, 0xc1f00000
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX7-ISEL-NEXT:    s_mov_b32 s6, -1
 ; GFX7-ISEL-NEXT:    s_mov_b32 s7, 0x43dfffff
 ; GFX7-ISEL-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[4:5], v[2:3], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX7-ISEL-NEXT:    v_cmp_nle_f64_e32 vcc, s[4:5], v[0:1]
 ; GFX7-ISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], s[6:7], v[0:1]
 ; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[0:1]
@@ -1132,12 +1132,12 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_mov_b32 s9, 0xc1f00000
+; GFX9-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_mov_b32 s7, 0x43dfffff
 ; GFX9-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[2:3], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX9-NEXT:    v_cmp_nle_f64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_f64_e64 s[4:5], s[6:7], v[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], v[0:1], v[0:1]
@@ -1220,8 +1220,7 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; GFX12-GI-NEXT:    v_fma_f64 v[2:3], 0xc1f00000, v[4:5], v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v4, v[4:5]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v6, v[2:3]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-GI-NEXT:    v_mov_b32_e32 v3, 0x43dfffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x43dfffff
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[2:3]
 ; GFX12-GI-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-GI-NEXT:    v_cndmask_b32_e64 v0, v4, 0x80000000, vcc_lo
@@ -1438,15 +1437,15 @@ define i64 @test_s_signed_i64_f64(double inreg %f) nounwind {
 ; GFX7-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v5, 0xc3e00000
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v7, 0x43dfffff
 ; GFX7-ISEL-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[4:5]
-; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[6:7], s[16:17], s[16:17]
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[6:7], s[16:17], s[16:17]
 ; GFX7-ISEL-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX7-ISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
@@ -1497,15 +1496,15 @@ define i64 @test_s_signed_i64_f64(double inreg %f) nounwind {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc3e00000
 ; GFX9-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x43dfffff
 ; GFX9-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[4:5]
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], s[16:17], s[16:17]
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-NEXT:    v_cmp_u_f64_e64 s[6:7], s[16:17], s[16:17]
 ; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
@@ -1585,8 +1584,7 @@ define i64 @test_s_signed_i64_f64(double inreg %f) nounwind {
 ; GFX12-GI-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v4, v[0:1]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GI-NEXT:    v_mov_b32_e32 v1, 0x43dfffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x43dfffff
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e32 vcc_lo, s[0:1], v[0:1]
 ; GFX12-GI-NEXT:    v_cmp_u_f64_e64 s0, s[0:1], s[0:1]
 ; GFX12-GI-NEXT:    s_wait_alu depctr_va_sdst(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
index 590c2c748bfdc..90d782e22bba0 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
@@ -1637,15 +1637,15 @@ define <2 x i64> @test_signed_v2f64_v2i64(<2 x double> %f) {
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s7, 0xc1f00000
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX7-ISEL-NEXT:    s_mov_b32 s8, -1
 ; GFX7-ISEL-NEXT:    s_mov_b32 s9, 0x43dfffff
-; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[10:11], v[0:1], v[0:1]
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[8:9], v[4:5], s4
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[10:11], v[6:7], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX7-ISEL-NEXT:    v_cmp_nle_f64_e32 vcc, s[4:5], v[0:1]
 ; GFX7-ISEL-NEXT:    v_cmp_nle_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[10:11], v[0:1], v[0:1]
 ; GFX7-ISEL-NEXT:    v_cmp_u_f64_e64 s[12:13], v[2:3], v[2:3]
 ; GFX7-ISEL-NEXT:    v_bfrev_b32_e32 v12, 1
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[8:9], v[8:9]
@@ -1726,15 +1726,15 @@ define <2 x i64> @test_signed_v2f64_v2i64(<2 x double> %f) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xc1f00000
+; GFX9-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX9-NEXT:    s_mov_b32 s8, -1
 ; GFX9-NEXT:    s_mov_b32 s9, 0x43dfffff
-; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[4:5], s4
 ; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[6:7], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc3e00000
 ; GFX9-NEXT:    v_cmp_nle_f64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_nle_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 s[10:11], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_u_f64_e64 s[12:13], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_bfrev_b32_e32 v12, 1
 ; GFX9-NEXT:    v_floor_f64_e32 v[8:9], v[8:9]
@@ -1863,9 +1863,8 @@ define <2 x i64> @test_signed_v2f64_v2i64(<2 x double> %f) {
 ; GFX12-GI-NEXT:    v_fma_f64 v[4:5], 0xc1f00000, v[8:9], v[4:5]
 ; GFX12-GI-NEXT:    v_fma_f64 v[6:7], 0xc1f00000, v[10:11], v[6:7]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v12, v[4:5]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v4, -1
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v6, v[6:7]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v5, 0x43dfffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v5, 0x43dfffff
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v7, v[8:9]
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v8, v[10:11]
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e64 s1, v[0:1], v[4:5]
@@ -2148,17 +2147,17 @@ define <2 x i64> @test_s_signed_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[2:3], s[18:19]
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0x43dfffff
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v9, 0xc3e00000
 ; GFX7-ISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s7, 0xc1f00000
-; GFX7-ISEL-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[8:9]
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[4:5], v[0:1], s4
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[6:7], v[2:3], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, -1
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0x43dfffff
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v11, s5
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GFX7-ISEL-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[8:9]
 ; GFX7-ISEL-NEXT:    v_cmp_nge_f64_e64 s[4:5], s[18:19], v[8:9]
 ; GFX7-ISEL-NEXT:    v_cmp_gt_f64_e64 s[8:9], s[18:19], v[10:11]
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[4:5], v[4:5]
@@ -2239,17 +2238,17 @@ define <2 x i64> @test_s_signed_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], s[18:19]
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    s_mov_b32 s5, 0x43dfffff
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xc3e00000
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xc1f00000
-; GFX9-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[8:9]
 ; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[0:1], s4
 ; GFX9-NEXT:    v_ldexp_f64 v[6:7], v[2:3], s4
 ; GFX9-NEXT:    s_mov_b32 s4, -1
-; GFX9-NEXT:    s_mov_b32 s5, 0x43dfffff
 ; GFX9-NEXT:    v_mov_b32_e32 v11, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-NEXT:    v_cmp_nge_f64_e32 vcc, s[16:17], v[8:9]
 ; GFX9-NEXT:    v_cmp_nge_f64_e64 s[4:5], s[18:19], v[8:9]
 ; GFX9-NEXT:    v_cmp_gt_f64_e64 s[8:9], s[18:19], v[10:11]
 ; GFX9-NEXT:    v_floor_f64_e32 v[4:5], v[4:5]
@@ -2377,9 +2376,8 @@ define <2 x i64> @test_s_signed_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX12-GI-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[4:5], v[0:1]
 ; GFX12-GI-NEXT:    v_fma_f64 v[2:3], 0xc1f00000, v[6:7], v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v8, v[0:1]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v2, v[2:3]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v1, 0x43dfffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x43dfffff
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v3, v[4:5]
 ; GFX12-GI-NEXT:    v_cvt_i32_f64_e32 v4, v[6:7]
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e64 s4, s[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll
index 4a8378c50414d..da8cb932953b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll
@@ -723,10 +723,10 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; GFX7-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_cmp_nle_f64_e32 vcc, 0, v[0:1]
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[4:5], v[2:3], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[4:5], v[4:5]
 ; GFX7-ISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], s[4:5], v[2:3]
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, -1
@@ -768,10 +768,10 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_cmp_nle_f64_e32 vcc, 0, v[0:1]
 ; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[2:3], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_floor_f64_e32 v[4:5], v[4:5]
 ; GFX9-NEXT:    v_fma_f64 v[2:3], v[4:5], s[4:5], v[2:3]
 ; GFX9-NEXT:    s_mov_b32 s4, -1
@@ -844,8 +844,7 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; GFX12-GI-NEXT:    v_fma_f64 v[2:3], 0xc1f00000, v[4:5], v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v4, v[4:5]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v6, v[2:3]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v2, -1
-; GFX12-GI-NEXT:    v_mov_b32_e32 v3, 0x43efffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v2, -1 :: v_dual_mov_b32 v3, 0x43efffff
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e64 s0, v[0:1], v[2:3]
 ; GFX12-GI-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX12-GI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc_lo
@@ -1004,9 +1003,9 @@ define i64 @test_s_unsigned_i64_f64(double inreg %f) nounwind {
 ; GFX7-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX7-ISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX7-ISEL-NEXT:    v_cvt_u32_f64_e32 v4, v[2:3]
@@ -1049,9 +1048,9 @@ define i64 @test_s_unsigned_i64_f64(double inreg %f) nounwind {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], s[16:17]
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cvt_u32_f64_e32 v4, v[2:3]
@@ -1124,8 +1123,7 @@ define i64 @test_s_unsigned_i64_f64(double inreg %f) nounwind {
 ; GFX12-GI-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v2, v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v4, v[0:1]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v0, -1
-; GFX12-GI-NEXT:    v_mov_b32_e32 v1, 0x43efffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x43efffff
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e32 vcc_lo, s[0:1], v[0:1]
 ; GFX12-GI-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GFX12-GI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll
index ab7fd327e2c58..758cec4632bd6 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll
@@ -1584,6 +1584,7 @@ define <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) {
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[4:5], v[0:1]
 ; GFX7-ISEL-NEXT:    v_trunc_f64_e32 v[6:7], v[2:3]
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    s_mov_b32 s8, -1
 ; GFX7-ISEL-NEXT:    s_mov_b32 s9, 0x43efffff
 ; GFX7-ISEL-NEXT:    v_cmp_nle_f64_e64 s[6:7], 0, v[2:3]
@@ -1591,7 +1592,6 @@ define <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) {
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[8:9], v[4:5], s4
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[10:11], v[6:7], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, 0
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[8:9], v[8:9]
 ; GFX7-ISEL-NEXT:    v_floor_f64_e32 v[10:11], v[10:11]
 ; GFX7-ISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], s[4:5], v[4:5]
@@ -1653,6 +1653,7 @@ define <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) {
 ; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[0:1]
 ; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[2:3]
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    s_mov_b32 s8, -1
 ; GFX9-NEXT:    s_mov_b32 s9, 0x43efffff
 ; GFX9-NEXT:    v_cmp_nle_f64_e64 s[6:7], 0, v[2:3]
@@ -1660,7 +1661,6 @@ define <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) {
 ; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[4:5], s4
 ; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[6:7], s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-NEXT:    v_floor_f64_e32 v[8:9], v[8:9]
 ; GFX9-NEXT:    v_floor_f64_e32 v[10:11], v[10:11]
 ; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], s[4:5], v[4:5]
@@ -1768,9 +1768,8 @@ define <2 x i64> @test_unsigned_v2f64_v2i64(<2 x double> %f) {
 ; GFX12-GI-NEXT:    v_fma_f64 v[4:5], 0xc1f00000, v[8:9], v[4:5]
 ; GFX12-GI-NEXT:    v_fma_f64 v[6:7], 0xc1f00000, v[10:11], v[6:7]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v12, v[4:5]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v4, -1
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v6, v[6:7]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v5, 0x43efffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v5, 0x43efffff
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v7, v[8:9]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v8, v[10:11]
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e64 s1, v[0:1], v[4:5]
@@ -2007,11 +2006,11 @@ define <2 x i64> @test_s_unsigned_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX7-ISEL-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s6, 0
 ; GFX7-ISEL-NEXT:    s_mov_b32 s7, 0xc1f00000
+; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0x43efffff
 ; GFX7-ISEL-NEXT:    v_cmp_nge_f64_e64 s[8:9], s[18:19], 0
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[4:5], v[0:1], s4
 ; GFX7-ISEL-NEXT:    v_ldexp_f64 v[6:7], v[2:3], s4
 ; GFX7-ISEL-NEXT:    s_mov_b32 s4, -1
-; GFX7-ISEL-NEXT:    s_mov_b32 s5, 0x43efffff
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX7-ISEL-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX7-ISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[16:17], v[8:9]
@@ -2078,11 +2077,11 @@ define <2 x i64> @test_s_unsigned_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
 ; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xc1f00000
+; GFX9-NEXT:    s_mov_b32 s5, 0x43efffff
 ; GFX9-NEXT:    v_cmp_nge_f64_e64 s[8:9], s[18:19], 0
 ; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[0:1], s4
 ; GFX9-NEXT:    v_ldexp_f64 v[6:7], v[2:3], s4
 ; GFX9-NEXT:    s_mov_b32 s4, -1
-; GFX9-NEXT:    s_mov_b32 s5, 0x43efffff
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, s[16:17], v[8:9]
@@ -2192,9 +2191,8 @@ define <2 x i64> @test_s_unsigned_v2f64_v2i64(<2 x double> inreg %f) {
 ; GFX12-GI-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[4:5], v[0:1]
 ; GFX12-GI-NEXT:    v_fma_f64 v[2:3], 0xc1f00000, v[6:7], v[2:3]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v8, v[0:1]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v2, v[2:3]
-; GFX12-GI-NEXT:    v_mov_b32_e32 v1, 0x43efffff
+; GFX12-GI-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, 0x43efffff
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v3, v[4:5]
 ; GFX12-GI-NEXT:    v_cvt_u32_f64_e32 v4, v[6:7]
 ; GFX12-GI-NEXT:    v_cmp_gt_f64_e32 vcc_lo, s[0:1], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index b5a5a930000e5..e3a5c408364f5 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -1702,8 +1702,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64 at abs32@lo
@@ -1959,14 +1958,14 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i64 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i64 at abs32@lo
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i64 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2097,13 +2096,13 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i64 at abs32@hi
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i64 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i64 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
@@ -3253,8 +3252,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
@@ -3286,8 +3284,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8 at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
@@ -3424,8 +3421,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8 at abs32@lo
@@ -3563,8 +3559,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i8 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i8 at abs32@lo
@@ -3708,8 +3703,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8 at abs32@lo
@@ -3863,8 +3857,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8 at abs32@lo
@@ -3946,8 +3939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
@@ -4017,14 +4010,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i8 at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i8 at abs32@lo
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i8 at abs32@lo
 ; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -4088,9 +4080,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 16
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 16 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i8 at abs32@hi
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
@@ -4156,14 +4147,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i8 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i8 at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i8 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
@@ -4308,8 +4298,7 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret at abs32@lo
@@ -4344,8 +4333,7 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret at abs32@lo
@@ -4506,8 +4494,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
@@ -4550,8 +4537,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
@@ -4654,8 +4640,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX9-NEXT:    s_mov_b32 s34, 0xc0c0004
+; GFX9-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s34
 ; GFX9-NEXT:    global_store_byte v[3:4], v2, off
@@ -4703,12 +4689,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    s_mov_b32 s32, s33
 ; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -4729,8 +4715,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
@@ -4742,8 +4727,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[3:4], v2, off
@@ -4773,8 +4758,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
@@ -4786,18 +4770,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, 2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    global_store_b8 v[3:4], v2, off
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[40:41], v0, off
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
@@ -4835,12 +4819,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0004
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
 ; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
 ; GFX10-SCRATCH-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
@@ -4955,8 +4939,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
@@ -5000,8 +4983,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
@@ -5112,8 +5094,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    s_mov_b32 s34, 0xc0c0004
 ; GFX9-NEXT:    v_perm_b32 v5, v0, v1, s34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v3, s34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX9-NEXT:    global_store_byte v[0:1], v4, off
@@ -5192,8 +5174,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
@@ -5208,11 +5189,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v5, v0, v1, 0xc0c0004
 ; GFX11-TRUE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 4
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v4, off
@@ -5242,8 +5222,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
@@ -5258,11 +5237,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v0, v1, 0xc0c0004
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v3, 0xc0c0004
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 4
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
@@ -5451,8 +5429,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret at abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret at abs32@lo
@@ -5504,8 +5481,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX11-FAKE16-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret at abs32@hi
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret at abs32@lo
@@ -5617,8 +5593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v40, 0
-; GFX9-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
+; GFX9-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
 ; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
@@ -5724,14 +5700,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    v_mov_b32_e32 v40, 0
-; GFX10-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
+; GFX10-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX10-NEXT:    v_writelane_b32 v44, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
-; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret at abs32@lo
 ; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
 ; GFX10-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
@@ -5831,9 +5806,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v43, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
-; GFX11-NEXT:    v_mov_b32_e32 v43, 0
+; GFX11-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v41, 0
+; GFX11-NEXT:    v_dual_mov_b32 v42, 16 :: v_dual_mov_b32 v43, 0
 ; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[40:41], off
@@ -5935,14 +5909,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 16
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
 ; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index a7a20db121623..c4707cf1a56eb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -556,9 +556,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -589,9 +589,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -5374,9 +5374,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -5407,9 +5407,9 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 21762ff4222a9..124f67a3787c9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -477,9 +477,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -490,9 +490,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -953,9 +953,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -965,9 +965,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -2169,9 +2169,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2182,9 +2182,9 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -2645,9 +2645,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2657,9 +2657,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -3497,9 +3497,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -3511,9 +3511,9 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -3996,9 +3996,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -4008,9 +4008,9 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index ea493405612d1..d524e952c5da2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -477,9 +477,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -490,9 +490,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
 ; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -953,9 +953,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -965,9 +965,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -2169,9 +2169,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2182,9 +2182,9 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
 ; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -2645,9 +2645,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -2657,9 +2657,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -3497,9 +3497,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -3511,9 +3511,9 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -3996,9 +3996,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -4008,9 +4008,9 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 748971fa059c1..ef422fb1d61fe 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -656,9 +656,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -689,9 +689,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX6-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -2513,9 +2513,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX7-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -2546,9 +2546,9 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX6-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index d6e11e6af6d84..8284bb5d9e99b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1205,8 +1205,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -1425,8 +1425,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -2432,8 +2432,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -2682,8 +2682,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -4502,8 +4502,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -4752,8 +4752,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -6320,8 +6320,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -6845,9 +6845,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
@@ -6929,8 +6929,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -7014,15 +7014,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -7099,13 +7099,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -7198,8 +7198,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -7455,8 +7455,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -7761,8 +7761,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB12_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -8286,9 +8286,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
@@ -8370,8 +8370,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -8455,15 +8455,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -8540,13 +8540,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -8639,8 +8639,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -8896,8 +8896,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -9202,8 +9202,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB14_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -9727,9 +9727,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB14_3
@@ -9811,8 +9811,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -9896,15 +9896,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -9981,13 +9981,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -10125,8 +10125,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -10650,9 +10650,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB15_3
@@ -10734,8 +10734,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -10819,15 +10819,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -10904,13 +10904,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -11003,8 +11003,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -11260,8 +11260,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -11566,8 +11566,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB17_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -12091,9 +12091,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB17_3
@@ -12175,8 +12175,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -12260,15 +12260,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -12345,13 +12345,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 957ff4766e709..c0aa29126ae4a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3605,8 +3605,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3857,8 +3857,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4143,9 +4142,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_2
@@ -4220,8 +4219,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4302,15 +4301,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -4398,13 +4397,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -4883,8 +4882,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5135,8 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5421,9 +5419,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
@@ -5498,8 +5496,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5580,15 +5578,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -5676,13 +5674,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6161,8 +6159,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -6413,8 +6411,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -6699,9 +6696,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_2
@@ -6776,8 +6773,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -6858,15 +6855,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -6954,13 +6951,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 97659df4f6496..053e4f619853b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3605,8 +3605,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3857,8 +3857,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4143,9 +4142,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_2
@@ -4220,8 +4219,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -4302,15 +4301,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -4398,13 +4397,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -4883,8 +4882,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5135,8 +5134,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5421,9 +5419,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
@@ -5498,8 +5496,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -5580,15 +5578,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -5676,13 +5674,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -6161,8 +6159,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -6413,8 +6411,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    ; implicit-def: $sgpr15
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1132-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -6699,9 +6696,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_2
@@ -6776,8 +6773,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -6858,15 +6855,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -6954,13 +6951,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 68e87b16c66fe..23515ffcfb139 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1317,8 +1317,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -1567,8 +1567,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -2656,8 +2656,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -2906,8 +2906,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -4830,8 +4830,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -5080,8 +5080,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -6648,8 +6648,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -7173,9 +7173,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
@@ -7257,8 +7257,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -7342,15 +7342,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -7427,13 +7427,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -7526,8 +7526,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -7783,8 +7783,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -8088,8 +8088,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB12_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -8613,9 +8613,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
@@ -8697,8 +8697,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -8782,15 +8782,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -8867,13 +8867,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -8966,8 +8966,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -9223,8 +9223,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -9529,8 +9529,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB14_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -10054,9 +10054,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB14_3
@@ -10138,8 +10138,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -10223,15 +10223,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB14_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -10308,13 +10308,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -10452,8 +10452,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -10977,9 +10977,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB15_3
@@ -11061,8 +11061,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -11146,15 +11146,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB15_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -11231,13 +11231,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
@@ -11329,8 +11329,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -11586,8 +11586,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB16_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0xc3300000
 ; GFX9-DPP-NEXT:    s_mov_b32 s1, 0x43300000
 ; GFX9-DPP-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
@@ -11892,8 +11892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-NEXT:  .LBB17_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -12417,9 +12417,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB17_3
@@ -12501,8 +12501,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
@@ -12586,15 +12586,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB17_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
@@ -12671,13 +12671,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_add_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1132-DPP-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, v3
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v6
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
index 4d8fb8db624f8..029529671781f 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
@@ -120,18 +120,18 @@ body:             |
     ; CHECK-LABEL: name: av_mov_b64_split
     ; CHECK: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $vgpr0, $sgpr4_sgpr5
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
-    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec, implicit-def $agpr2_agpr3
-    ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr2_agpr3
-    ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr4_agpr5
-    ; CHECK-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr4_agpr5
-    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 3, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
+    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec, implicit-def $agpr2
+    ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr3
+    ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr4
+    ; CHECK-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 3, implicit $exec, implicit-def $vgpr0
+    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
     ; CHECK-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
     ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
-    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec, implicit-def $vgpr0_vgpr1
-    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec, implicit-def $vgpr0
+    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
     ; CHECK-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
     ; CHECK-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index cfa03402ef048..30e4db85c3f39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -997,13 +997,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1018,13 +1018,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1071,8 +1071,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -1088,8 +1088,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -1375,8 +1375,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s10, s6
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s8, s2
 ; CI-NEXT:    s_mov_b32 s9, s3
@@ -1395,8 +1395,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
@@ -1433,8 +1433,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s10, s6
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s8, s2
 ; CI-NEXT:    s_mov_b32 s9, s3
@@ -1453,8 +1453,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
@@ -1490,8 +1490,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) no
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1503,8 +1503,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) no
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1532,8 +1532,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1545,8 +1545,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1577,11 +1577,11 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; CI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc
@@ -1595,13 +1595,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1637,10 +1637,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1657,8 +1657,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
@@ -1690,8 +1690,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1713,8 +1713,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1735,8 +1735,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index cccd2449c3f01..2eeb47dc2f133 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -737,8 +737,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s10, s6
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s8, s2
 ; CI-NEXT:    s_mov_b32 s9, s3
@@ -757,8 +757,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
@@ -795,8 +795,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s10, s6
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s8, s2
 ; CI-NEXT:    s_mov_b32 s9, s3
@@ -815,8 +815,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
@@ -852,8 +852,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -865,8 +865,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -894,8 +894,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -907,8 +907,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -939,11 +939,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; CI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[8:11], 0 addr64 offset:40 glc
@@ -957,13 +957,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -999,10 +999,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1019,8 +1019,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -1362,8 +1362,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 9
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1385,8 +1385,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 9
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1407,8 +1407,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ;
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1635,13 +1635,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1656,13 +1656,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1709,8 +1709,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
@@ -1726,8 +1726,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 0ba62e49cabc3..c34200aa83f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -95,8 +95,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfl
 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -106,8 +105,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfl
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
 ; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -140,8 +138,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl
 ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4:
 ; GFX12-SDAG:       ; %bb.0: ; %main_body
 ; GFX12-SDAG-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -151,8 +148,7 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
 ; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index 5a27a72de274d..4131ac744945e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -30,11 +30,11 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; GISEL12-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, v0
 ; GISEL12-NEXT:    s_wait_alu depctr_va_sdst(0)
 ; GISEL12-NEXT:    v_mov_b32_e32 v0, s12
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GISEL12-NEXT:    v_mov_b32_e32 v1, s13
 ; GISEL12-NEXT:    s_mov_b64 exec, s[10:11]
-; GISEL12-NEXT:    v_mov_b32_e32 v11, v0
 ; GISEL12-NEXT:    v_add_nc_u32_e32 v10, 42, v13
+; GISEL12-NEXT:    v_mov_b32_e32 v11, v0
 ; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GISEL12-NEXT:    v_mov_b32_e32 v12, v1
 ; GISEL12-NEXT:  .LBB0_2: ; %tail
@@ -89,8 +89,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
 ; GISEL10-NEXT:    v_mov_b32_e32 v0, s12
 ; GISEL10-NEXT:    v_mov_b32_e32 v1, s13
 ; GISEL10-NEXT:    s_mov_b64 exec, s[10:11]
-; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
 ; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v13
+; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
 ; GISEL10-NEXT:    v_mov_b32_e32 v12, v1
 ; GISEL10-NEXT:  .LBB0_2: ; %tail
 ; GISEL10-NEXT:    s_or_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index ee6feabcc1570..7489d56e68e6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -129,8 +129,8 @@ define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) {
 ; GFX11-GISEL-LABEL: test_i_i64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x63
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -151,8 +151,8 @@ define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) {
 ; GFX11-GISEL-LABEL: test_i_f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0x40934a00 :: v_dual_mov_b32 v2, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40934a00
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index b9bf76c1423b6..eabe746e8b38e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -178,14 +178,14 @@ define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrsp
 ; GFX11-GISEL-LABEL: test_scc_quadmask_32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-GISEL-NEXT:    s_quadmask_b32 s1, s1
 ; GFX11-GISEL-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX11-GISEL-NEXT:    global_store_b32 v2, v3, s[2:3]
 ; GFX11-GISEL-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -193,14 +193,13 @@ define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrsp
 ; GFX11-SDAG-LABEL: test_scc_quadmask_32:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-SDAG-NEXT:    s_quadmask_b32 s1, s1
 ; GFX11-SDAG-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX11-SDAG-NEXT:    global_store_b32 v2, v3, s[2:3]
 ; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
@@ -221,6 +220,7 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
 ; GFX11-GISEL-NEXT:    s_clause 0x1
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
 ; GFX11-GISEL-NEXT:    s_load_b32 s4, s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
 ; GFX11-GISEL-NEXT:    s_and_b32 s4, s4, 1
@@ -229,7 +229,6 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-GISEL-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
 ; GFX11-GISEL-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -239,7 +238,7 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_and_b32 s4, s6, 1
 ; GFX11-SDAG-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
@@ -247,7 +246,6 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX11-SDAG-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
 ; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v5, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 3eaed5cf45e07..e245feb68f587 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -225,9 +225,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 32
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -258,9 +258,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; CHECK-GISEL-NEXT:    s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40400000
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f6e3b0ed78b20..f77bb0f786496 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -1802,8 +1802,8 @@ define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
 ; GFX1132GISEL-LABEL: const_value_i64:
 ; GFX1132GISEL:       ; %bb.0: ; %entry
 ; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 5c97fd5875ad8..b87322ef9a0cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -80,12 +80,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GISEL11-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s0
-; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GISEL11-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s0
 ; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
-; GISEL11-NEXT:    v_mov_b32_e32 v2, v0
-; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GISEL11-NEXT:    v_mov_b32_e32 v3, v1
+; GISEL11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
 ; GISEL11-NEXT:    global_store_b64 v[8:9], v[2:3], off
 ; GISEL11-NEXT:    s_endpgm
 ;
@@ -101,9 +99,7 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s0
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
-; DAGISEL11-NEXT:    v_mov_b32_e32 v3, v1
-; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; DAGISEL11-NEXT:    v_mov_b32_e32 v4, v2
+; DAGISEL11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
 ; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[3:4], off
 ; DAGISEL11-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 3fb6749884f1e..ce03681f2eca8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -71,11 +71,11 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
 ; GCN-LABEL: set_inactive_imm_poison_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.f64.ll
index 4c922a951523d..53f404c94128d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.f64.ll
@@ -30,9 +30,9 @@ define double @v_exp_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -63,8 +63,8 @@ define double @v_exp_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -99,8 +99,8 @@ define double @v_exp_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -134,8 +134,8 @@ define double @v_exp_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -404,8 +404,8 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xbfe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[4:5], s[8:9], v[0:1]
-; SI-SDAG-NEXT:    s_mov_b32 s10, 0x3b39803f
 ; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[2:3], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], s[10:11], v[7:8]
 ; SI-SDAG-NEXT:    v_bfi_b32 v7, s26, v16, v15
@@ -414,10 +414,10 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f64 v[6:7], v[16:17], -v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s12, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v10, s12
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, s13
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[8:9], v[2:3]
@@ -460,9 +460,9 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], s[6:7]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], 1.0
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v6, v[6:7]
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
@@ -493,11 +493,11 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_add_f64 v[10:11], v[4:5], v[8:9]
-; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x432fffff
 ; SI-GISEL-NEXT:    v_add_f64 v[9:10], v[10:11], -v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, -1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x432fffff
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[4:5]|, v[11:12]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v7
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
@@ -516,8 +516,8 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[12:13], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[8:9], v[12:13], v[14:15]
@@ -556,8 +556,8 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[16:17], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], 1.0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x40900000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[14:15]
 ; SI-GISEL-NEXT:    v_cmp_nlt_f64_e64 s[4:5], v[0:1], v[16:17]
@@ -589,15 +589,15 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], s[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0xbfe62e42
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s7
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s6, 0
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x40900000
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[6:7], v[0:1]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[6:7], s[6:7], v[2:3]
@@ -755,15 +755,15 @@ define <2 x double> @v_exp_v2f64(<2 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], s[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0xbfe62e42
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x40900000
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[6:7], v[0:1]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[6:7], s[6:7], v[2:3]
@@ -937,12 +937,12 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0xbfe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[11:12], s[10:11], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s12, 0x3b39803f
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0xbc7abc9e
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[11:12], s[12:13], v[7:8]
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s5
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-SDAG-NEXT:    v_fma_f64 v[15:16], v[13:14], s[14:15], v[8:9]
@@ -975,8 +975,8 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s40, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[11:12], v[13:14], v[15:16], 1.0
 ; SI-SDAG-NEXT:    v_mul_f64 v[13:14], v[2:3], s[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s44, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s41, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s44, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s45, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[11:12], v[11:12], v7
 ; SI-SDAG-NEXT:    v_bfi_b32 v7, s46, v10, v14
@@ -1067,13 +1067,13 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[2:3], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[12:13], v[20:21], v[14:15]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3e928af3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v7, 0x80000000, v17
 ; SI-GISEL-NEXT:    v_fma_f64 v[28:29], v[14:15], v[24:25], v[26:27]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x623fde64
 ; SI-GISEL-NEXT:    v_or_b32_e32 v7, 0x43300000, v7
+; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3ec71dee
 ; SI-GISEL-NEXT:    v_add_f64 v[22:23], v[16:17], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[28:29], v[14:15], v[28:29], v[30:31]
@@ -1089,21 +1089,21 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[28:29], v[34:35]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[22:23], v[28:29]
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v37, 0x3f811111
-; SI-GISEL-NEXT:    v_and_b32_e32 v7, 0x80000000, v11
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[22:23], v[36:37]
+; SI-GISEL-NEXT:    v_and_b32_e32 v7, 0x80000000, v11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x555502a1
-; SI-GISEL-NEXT:    v_or_b32_e32 v7, 0x43300000, v7
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v39, 0x3fa55555
-; SI-GISEL-NEXT:    v_add_f64 v[48:49], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_or_b32_e32 v7, 0x43300000, v7
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[22:23], v[38:39]
+; SI-GISEL-NEXT:    v_add_f64 v[48:49], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v50, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v51, 0x3fc55555
-; SI-GISEL-NEXT:    v_add_f64 v[6:7], v[48:49], -v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[22:23], v[50:51]
+; SI-GISEL-NEXT:    v_add_f64 v[6:7], v[48:49], -v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v49, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[22:23], v[48:49]
@@ -1130,16 +1130,16 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[12:13], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[14:15], v[8:9], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[12:13], v[50:51]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[24:25], v[36:37]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[12:13], v[48:49]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v26
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v16, v[16:17]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0xc090cc00
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
@@ -1180,11 +1180,11 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], s[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s8, 0x3b39803f
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xbc7abc9e
-; VI-SDAG-NEXT:    s_mov_b32 s10, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s11, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s10, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; VI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
 ; VI-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s10
@@ -1355,8 +1355,8 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v19, v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[14:15], v20
+; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[14:15], v[16:17], v18
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v19
@@ -1392,11 +1392,11 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], s[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0x3b39803f
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xbc7abc9e
-; GFX900-SDAG-NEXT:    s_mov_b32 s10, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s11, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s10, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX900-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s10
@@ -1567,8 +1567,8 @@ define <3 x double> @v_exp_v3f64(<3 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v19, v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[14:15], v20
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[14:15], v[16:17], v18
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v19
@@ -1632,16 +1632,16 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
 ; SI-SDAG-NEXT:    v_fma_f64 v[16:17], v[10:11], s[10:11], v[0:1]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0xbc7abc9e
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[19:20], v[10:11], s[12:13], v[16:17]
-; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v17, s5
+; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v16, s4
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x623fde64
 ; SI-SDAG-NEXT:    v_fma_f64 v[21:22], v[19:20], s[16:17], v[16:17]
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x623fde64
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3ec71dee
 ; SI-SDAG-NEXT:    v_fma_f64 v[21:22], v[19:20], v[21:22], s[14:15]
 ; SI-SDAG-NEXT:    s_mov_b32 s18, 0x7c89e6b0
@@ -1668,8 +1668,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v9, v[10:11]
 ; SI-SDAG-NEXT:    v_fma_f64 v[21:22], v[19:20], v[21:22], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s28, 0
-; SI-SDAG-NEXT:    s_mov_b32 s26, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s29, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s26, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s27, 0xc090cc00
 ; SI-SDAG-NEXT:    v_fma_f64 v[19:20], v[19:20], v[21:22], 1.0
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[28:29], v[0:1]
@@ -1763,9 +1763,9 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_add_f64 v[16:17], v[14:15], v[8:9]
 ; SI-GISEL-NEXT:    v_mul_f64 v[18:19], v[2:3], v[12:13]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, -1
 ; SI-GISEL-NEXT:    v_add_f64 v[16:17], v[16:17], -v[8:9]
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v19
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, -1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x432fffff
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[14:15]|, v[10:11]
@@ -1791,36 +1791,36 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], -v[16:17], v[26:27], v[24:25]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[20:21], v[24:25], v[28:29]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[20:21], v[30:31], v[32:33]
+; SI-GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v35, 0x3efa0199
-; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v13
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[20:21], v[30:31], v[34:35]
+; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v13
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x14761f6e
-; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v37, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_add_f64 v[38:39], v[12:13], v[8:9]
+; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[20:21], v[30:31], v[36:37]
+; SI-GISEL-NEXT:    v_add_f64 v[38:39], v[12:13], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v49, 0x3f56c16c
-; SI-GISEL-NEXT:    v_add_f64 v[8:9], v[38:39], -v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[20:21], v[30:31], v[48:49]
+; SI-GISEL-NEXT:    v_add_f64 v[8:9], v[38:39], -v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v39, 0x3f811111
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[12:13]|, v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[20:21], v[30:31], v[38:39]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3fa55555
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[20:21], v[10:11], v[30:31]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v13, vcc
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[20:21], v[10:11], v[30:31]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3fc55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[20:21], v[10:11], v[12:13]
@@ -1856,9 +1856,9 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[34:35]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[22:23], v[24:25], v[34:35]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[36:37]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[48:49]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[22:23], v[24:25], v[36:37]
+; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[48:49]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[38:39]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x40900000
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], v[30:31]
@@ -1873,11 +1873,11 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[22:23], v[24:25], v[30:31]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[14:15], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x7ff00000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[24:25], v[12:13]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v19, v18, v11, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[10:11], v[14:15], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[4:5], v[28:29]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0xc090cc00
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[22:23], v[12:13], v[50:51]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
@@ -1917,8 +1917,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s19, 0xbfe62e42
 ; VI-SDAG-NEXT:    s_mov_b32 s20, 0x3b39803f
 ; VI-SDAG-NEXT:    s_mov_b32 s21, 0xbc7abc9e
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s22, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[10:11], v[8:9]
 ; VI-SDAG-NEXT:    s_mov_b32 s23, 0x3e5ade15
@@ -1945,8 +1945,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; VI-SDAG-NEXT:    s_mov_b32 s58, 0
 ; VI-SDAG-NEXT:    v_fma_f64 v[14:15], v[12:13], s[22:23], v[8:9]
-; VI-SDAG-NEXT:    s_mov_b32 s60, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s59, 0x40900000
+; VI-SDAG-NEXT:    s_mov_b32 s60, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s61, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[58:59], v[0:1]
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[60:61], v[0:1]
@@ -2063,8 +2063,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[26:27], v[22:23]
 ; VI-GISEL-NEXT:    v_fma_f64 v[24:25], -v[8:9], v[26:27], v[24:25]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[28:29], v[18:19], v[20:21], v[26:27]
 ; VI-GISEL-NEXT:    v_fma_f64 v[30:31], v[16:17], v[20:21], v[26:27]
@@ -2175,8 +2175,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s19, 0xbfe62e42
 ; GFX900-SDAG-NEXT:    s_mov_b32 s20, 0x3b39803f
 ; GFX900-SDAG-NEXT:    s_mov_b32 s21, 0xbc7abc9e
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s22, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[10:11], v[8:9]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s23, 0x3e5ade15
@@ -2203,8 +2203,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s58, 0
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[14:15], v[12:13], s[22:23], v[8:9]
-; GFX900-SDAG-NEXT:    s_mov_b32 s60, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s59, 0x40900000
+; GFX900-SDAG-NEXT:    s_mov_b32 s60, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s61, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[58:59], v[0:1]
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[60:61], v[0:1]
@@ -2321,8 +2321,8 @@ define <4 x double> @v_exp_v4f64(<4 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[26:27], v[22:23]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[24:25], -v[8:9], v[26:27], v[24:25]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v20, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[28:29], v[18:19], v[20:21], v[26:27]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[30:31], v[16:17], v[20:21], v[26:27]
@@ -2451,9 +2451,9 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[0:1], s[2:3], v[2:3]
-; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2484,8 +2484,8 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40900000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
@@ -2521,8 +2521,8 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[0:1], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2556,8 +2556,8 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -2573,8 +2573,8 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x652b82fe
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3ff71547
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0xbfe62e42
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
@@ -2695,8 +2695,8 @@ define amdgpu_ps <2 x i32> @s_exp_f64(double inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x652b82fe
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3ff71547
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0xbfe62e42
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
@@ -2840,8 +2840,8 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xbfe62e42
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[2:3], s[6:7], v[5:6]
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0x3b39803f
 ; SI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[8:9], v[5:6]
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s26, v12, v1
@@ -2855,8 +2855,8 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[6:7], v[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s10
+; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s11
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[8:9], v[4:5]
@@ -2900,8 +2900,8 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v13, v[0:1]
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x40900000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v12
@@ -2948,17 +2948,17 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[6:7], -v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[10:11], s[0:1]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[10:11], s[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3b39803f
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[10:11], s[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c7abc9e
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[10:11], s[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x6a5dcb37
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e5ade15
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[4:5], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e5ade15
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[10:11], v[8:9]
@@ -2996,14 +2996,14 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[8:9], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[16:17]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[16:17]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v13, v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v12, v10, v7, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
@@ -3033,16 +3033,16 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-SDAG-NEXT:    v_mul_f64 v[2:3], s[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, s1
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0xbfe62e42
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s7
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
@@ -3097,8 +3097,8 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v3
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3139,6 +3139,7 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[8:9], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[10:11]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x623fde64
@@ -3180,7 +3181,6 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v8, v[0:1]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -3211,16 +3211,16 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[2:3], s[2:3], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, s1
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0xbfe62e42
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
@@ -3275,8 +3275,8 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v2
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3317,6 +3317,7 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[8:9], v[10:11]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[10:11]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0x623fde64
@@ -3358,7 +3359,6 @@ define amdgpu_ps <4 x i32> @s_exp_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v8, v[0:1]
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[2:3], v[2:3]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -3410,12 +3410,12 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[2:3], s[8:9], v[5:6]
 ; SI-SDAG-NEXT:    s_mov_b32 s12, 0x3b39803f
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0xbc7abc9e
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[2:3], s[12:13], v[5:6]
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s7
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, s6
 ; SI-SDAG-NEXT:    v_fma_f64 v[9:10], v[5:6], s[14:15], v[7:8]
@@ -3545,35 +3545,35 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-GISEL-NEXT:    v_and_b32_e32 v5, 0x80000000, v7
 ; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
-; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
+; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], s[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x3b39803f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[14:15], -v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[6:7]|, v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[16:17], v[12:13]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3e928af3
-; SI-GISEL-NEXT:    v_mul_f64 v[0:1], s[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[12:13], v[14:15], v[18:19]
+; SI-GISEL-NEXT:    v_mul_f64 v[0:1], s[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3ec71dee
-; SI-GISEL-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[12:13], v[20:21], v[22:23]
+; SI-GISEL-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x7c89e6b0
-; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3efa0199
-; SI-GISEL-NEXT:    v_add_f64 v[26:27], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[12:13], v[20:21], v[24:25]
+; SI-GISEL-NEXT:    v_add_f64 v[26:27], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[26:27], -v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[12:13], v[20:21], v[28:29]
+; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[26:27], -v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9]
@@ -3607,23 +3607,23 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[28:29]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], v[28:29]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[26:27]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], v[26:27]
-; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[20:21]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v18, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[20:21]
+; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[30:31]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v18
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[30:31]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[32:33]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[12:13], v[16:17], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[10:11], v[14:15], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x40900000
+; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v17, v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[12:13], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
-; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
@@ -3666,9 +3666,9 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[2:3], s[4:5], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfefa39ef
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0xbfe62e42
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -3677,8 +3677,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v11, s1
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[6:7], v[6:7]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], s[6:7], v[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[0:1], s[6:7], v[10:11]
@@ -3746,8 +3746,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v14
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[4:5], v[8:9]
@@ -3861,8 +3861,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[14:15]
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[6:7]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[6:7]
@@ -3896,9 +3896,9 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[2:3], s[4:5], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0xbfe62e42
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -3907,8 +3907,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v11, s1
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[6:7], v[6:7]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], s[6:7], v[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[0:1], s[6:7], v[10:11]
@@ -3976,8 +3976,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v14
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[4:5], v[8:9]
@@ -4091,8 +4091,8 @@ define amdgpu_ps <6 x i32> @s_exp_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[14:15]
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[6:7]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[6:7]
@@ -4145,8 +4145,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xbfe62e42
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, s7
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[2:3], s[8:9], v[5:6]
-; SI-SDAG-NEXT:    s_mov_b32 s10, 0x3b39803f
 ; SI-SDAG-NEXT:    v_mul_f64 v[8:9], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[10:11], v[5:6]
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s33, v19, v9
@@ -4159,11 +4159,11 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0x3e928af3
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s33, v19, v12
 ; SI-SDAG-NEXT:    v_add_f64 v[13:14], v[11:12], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v15, s16
 ; SI-SDAG-NEXT:    v_add_f64 v[13:14], v[13:14], -v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[11:12]|, s[12:13]
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v16, s17
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
@@ -4188,8 +4188,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s27, 0x3fa55555
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v18, s5
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[6:7], v[13:14], s[26:27]
-; SI-SDAG-NEXT:    s_mov_b32 s28, 0x55555511
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v17, s4
+; SI-SDAG-NEXT:    s_mov_b32 s28, 0x55555511
 ; SI-SDAG-NEXT:    s_mov_b32 s29, 0x3fc55555
 ; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[8:9], s[8:9], v[17:18]
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[6:7], v[13:14], s[28:29]
@@ -4226,9 +4226,9 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[19:20], v[4:5], v[19:20], s[16:17]
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[17:18], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[4:5], v[19:20], s[18:19]
-; SI-SDAG-NEXT:    v_mov_b32_e32 v13, 0
-; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[4:5], v[17:18], s[20:21]
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[4:5], v[17:18], s[20:21]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], v[17:18], s[22:23]
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[13:14]
@@ -4248,9 +4248,9 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[15:16], v[9:10], v[15:16], s[16:17]
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[7:8], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[9:10], v[15:16], s[18:19]
-; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0
-; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[9:10], v[7:8], s[20:21]
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[11:12]
+; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[9:10], v[7:8], s[20:21]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[9:10], v[7:8], s[22:23]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v16, 0xc090cc00
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[9:10], v[7:8], s[24:25]
@@ -4331,28 +4331,28 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v15, v6, v15, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[18:19], v[16:17]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[16:17], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mul_f64 v[0:1], s[6:7], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[22:23], v[24:25]
+; SI-GISEL-NEXT:    v_mul_f64 v[0:1], s[6:7], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3efa0199
+; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[22:23], v[26:27]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v14, v5, v14, vcc
 ; SI-GISEL-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
-; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[22:23], v[26:27]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x14761f6e
-; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_add_f64 v[30:31], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[22:23], v[28:29]
+; SI-GISEL-NEXT:    v_add_f64 v[30:31], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3f56c16c
-; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[30:31], -v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[6:7], v[22:23], v[32:33]
+; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[30:31], -v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3f811111
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9]
@@ -4463,8 +4463,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-SDAG-NEXT:    v_mul_f64 v[2:3], s[6:7], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s6
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xbfe62e42
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, s7
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], s[4:5], v[0:1]
@@ -4473,8 +4473,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    s_mov_b32 s14, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s15, 0x3e928af3
-; VI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v10, s14
+; VI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v11, s15
@@ -4566,9 +4566,9 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[10:11], s[2:3], v[18:19]
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[12:13], v[10:11], 1.0
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[0:1]
+; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[16:17]
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[8:9], v21
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[16:17]
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[6:7], s[6:7], v[8:9]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[0:1], v[8:9]
@@ -4745,8 +4745,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[2:3], s[6:7], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, s6
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xbfe62e42
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[4:5], s[4:5], v[0:1]
@@ -4755,8 +4755,8 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s14, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s15, 0x3e928af3
-; GFX900-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v10, s14
+; GFX900-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v11, s15
@@ -4848,9 +4848,9 @@ define amdgpu_ps <8 x i32> @s_exp_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[10:11], s[2:3], v[18:19]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[2:3], v[12:13], v[10:11], 1.0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[0:1]
+; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[16:17]
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[8:9], v21
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[16:17]
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[6:7], s[6:7], v[8:9]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[0:1], v[8:9]
@@ -5048,9 +5048,9 @@ define double @v_exp_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5081,8 +5081,8 @@ define double @v_exp_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5117,8 +5117,8 @@ define double @v_exp_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5152,8 +5152,8 @@ define double @v_exp_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5426,9 +5426,9 @@ define double @v_exp_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5459,8 +5459,8 @@ define double @v_exp_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5495,8 +5495,8 @@ define double @v_exp_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5530,8 +5530,8 @@ define double @v_exp_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -|v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5805,9 +5805,9 @@ define double @v_exp_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5838,8 +5838,8 @@ define double @v_exp_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
@@ -5874,8 +5874,8 @@ define double @v_exp_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5909,8 +5909,8 @@ define double @v_exp_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -6183,9 +6183,9 @@ define double @v_exp_f64_fast(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6246,8 +6246,8 @@ define double @v_exp_f64_fast(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6524,9 +6524,9 @@ define double @v_exp_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6557,8 +6557,8 @@ define double @v_exp_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -6593,8 +6593,8 @@ define double @v_exp_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6628,8 +6628,8 @@ define double @v_exp_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -6901,9 +6901,9 @@ define double @v_exp_f64_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6964,8 +6964,8 @@ define double @v_exp_f64_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7242,9 +7242,9 @@ define double @v_exp_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7275,8 +7275,8 @@ define double @v_exp_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -7311,8 +7311,8 @@ define double @v_exp_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7346,8 +7346,8 @@ define double @v_exp_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -7619,9 +7619,9 @@ define double @v_fabs_exp_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7652,8 +7652,8 @@ define double @v_fabs_exp_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -7688,8 +7688,8 @@ define double @v_fabs_exp_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7723,8 +7723,8 @@ define double @v_fabs_exp_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -7997,9 +7997,9 @@ define double @v_exp_f64_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8060,8 +8060,8 @@ define double @v_exp_f64_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8366,9 +8366,9 @@ define double @v_exp_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8399,8 +8399,8 @@ define double @v_exp_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -8435,34 +8435,34 @@ define double @v_exp_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[8:9], v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3c7abc9e
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[10:11], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[14:15], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[0:1]
@@ -8470,10 +8470,10 @@ define double @v_exp_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[2:3], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -8742,8 +8742,8 @@ define double @v_exp_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    s_brev_b32 s4, -2
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s4, v4, v3
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[6:7], -v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[6:7]
@@ -8755,9 +8755,9 @@ define double @v_exp_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8788,8 +8788,8 @@ define double @v_exp_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -8820,13 +8820,13 @@ define double @v_exp_f64_from_fpext_f32(float %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3b39803f
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[10:11], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9145,8 +9145,8 @@ define double @v_exp_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[8:9], v[2:3]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9177,8 +9177,8 @@ define double @v_exp_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[2:3]
@@ -9200,12 +9200,12 @@ define double @v_exp_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x432fffff
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff71547
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3b39803f
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x652b82fe
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff71547
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
@@ -9225,26 +9225,26 @@ define double @v_exp_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[14:15], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[0:1]
@@ -9252,10 +9252,10 @@ define double @v_exp_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[2:3], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -9541,9 +9541,9 @@ define double @v_exp_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9574,8 +9574,8 @@ define double @v_exp_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -9610,8 +9610,8 @@ define double @v_exp_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9645,8 +9645,8 @@ define double @v_exp_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -9918,9 +9918,9 @@ define double @v_exp_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xbc7abc9e
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9981,8 +9981,8 @@ define double @v_exp_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.f64.ll
index 9193927f18a3e..796a5010d3886 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.f64.ll
@@ -36,9 +36,9 @@ define double @v_exp10_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -69,8 +69,8 @@ define double @v_exp10_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -111,8 +111,8 @@ define double @v_exp10_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -146,8 +146,8 @@ define double @v_exp10_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -178,9 +178,9 @@ define double @v_exp10_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -242,8 +242,8 @@ define double @v_exp10_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -308,9 +308,9 @@ define double @v_exp10_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -372,8 +372,8 @@ define double @v_exp10_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -446,8 +446,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s12, 0x494ea3e9
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0xbcaf48ad
 ; SI-SDAG-NEXT:    v_mul_f64 v[9:10], v[7:8], s[12:13]
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0xbbb55516
 ; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[2:3], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[7:8], s[14:15], v[9:10]
 ; SI-SDAG-NEXT:    v_bfi_b32 v7, s28, v16, v15
@@ -460,8 +460,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[8:9], v[2:3]
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[10:11], v[14:15]
-; SI-SDAG-NEXT:    s_mov_b32 s18, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v10, s16
+; SI-SDAG-NEXT:    s_mov_b32 s18, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s19, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, s17
 ; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[14:15], s[12:13]
@@ -504,9 +504,9 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], s[8:9]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[14:15], v[4:5], 1.0
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v6, v[6:7]
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
@@ -537,12 +537,11 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_add_f64 v[10:11], v[4:5], v[8:9]
-; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x432fffff
 ; SI-GISEL-NEXT:    v_add_f64 v[9:10], v[10:11], -v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, -1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x432fffff
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[4:5]|, v[11:12]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbbb55516
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v7
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
@@ -563,13 +562,14 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0xbcaf48ad
 ; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[10:11], v[12:13]
-; SI-GISEL-NEXT:    v_mul_f64 v[12:13], v[8:9], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x40026bb1
+; SI-GISEL-NEXT:    v_mul_f64 v[12:13], v[8:9], v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[16:17], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[12:13], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[8:9], v[12:13], v[14:15]
@@ -608,8 +608,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[16:17], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], 1.0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x40900000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[14:15]
 ; SI-GISEL-NEXT:    v_cmp_nlt_f64_e64 s[4:5], v[0:1], v[16:17]
@@ -641,8 +641,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], s[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0xbfd34413
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
@@ -663,8 +663,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[4:5], v[12:13]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[4:5], v[14:15]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s7
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; VI-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -752,8 +752,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], v[14:15]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[16:17], v[12:13]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[12:13], v[14:15]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[8:9], v[12:13], v[14:15]
@@ -823,8 +823,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], s[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0xbfd34413
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
@@ -845,8 +845,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[4:5], v[12:13]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[4:5], v[14:15]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s7
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -934,8 +934,8 @@ define <2 x double> @v_exp10_v2f64(<2 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], v[14:15]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[16:17], v[12:13]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[12:13], v[14:15]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[8:9], v[12:13], v[14:15]
@@ -1027,12 +1027,12 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0xbcaf48ad
 ; SI-SDAG-NEXT:    v_mul_f64 v[13:14], v[7:8], s[16:17]
 ; SI-SDAG-NEXT:    s_mov_b32 s18, 0xbbb55516
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s19, 0x40026bb1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[7:8], s[18:19], v[13:14]
-; SI-SDAG-NEXT:    s_mov_b32 s20, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s5
+; SI-SDAG-NEXT:    s_mov_b32 s20, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s21, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-SDAG-NEXT:    v_fma_f64 v[15:16], v[13:14], s[20:21], v[8:9]
@@ -1067,9 +1067,9 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_mul_f64 v[13:14], v[2:3], s[8:9]
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[11:12], v[11:12], v7
 ; SI-SDAG-NEXT:    v_bfi_b32 v7, s60, v10, v14
-; SI-SDAG-NEXT:    s_mov_b32 s58, 0
 ; SI-SDAG-NEXT:    v_add_f64 v[15:16], v[13:14], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s47, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s58, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s59, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[46:47], v[0:1]
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[58:59], v[0:1]
@@ -1148,9 +1148,9 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v13, 0x43300000, v8
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, -1
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[14:15], -v[12:13]
 ; SI-GISEL-NEXT:    v_and_b32_e32 v13, 0x80000000, v17
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, -1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x432fffff
 ; SI-GISEL-NEXT:    v_or_b32_e32 v13, 0x43300000, v13
 ; SI-GISEL-NEXT:    v_add_f64 v[18:19], v[16:17], v[12:13]
@@ -1167,20 +1167,20 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xa994fd21
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0xbc49dc1d
-; SI-GISEL-NEXT:    v_and_b32_e32 v13, 0x80000000, v11
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[20:21], v[18:19]
+; SI-GISEL-NEXT:    v_and_b32_e32 v13, 0x80000000, v11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0x494ea3e9
-; SI-GISEL-NEXT:    v_or_b32_e32 v13, 0x43300000, v13
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0xbcaf48ad
-; SI-GISEL-NEXT:    v_add_f64 v[24:25], v[10:11], v[12:13]
+; SI-GISEL-NEXT:    v_or_b32_e32 v13, 0x43300000, v13
 ; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[18:19], v[22:23]
+; SI-GISEL-NEXT:    v_add_f64 v[24:25], v[10:11], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x40026bb1
-; SI-GISEL-NEXT:    v_add_f64 v[12:13], v[24:25], -v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[18:19], v[28:29], v[26:27]
+; SI-GISEL-NEXT:    v_add_f64 v[12:13], v[24:25], -v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3e928af3
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[10:11]|, v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[18:19], v[24:25], v[26:27]
@@ -1194,38 +1194,38 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[30:31], -v[6:7], v[20:21], v[30:31]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[20:21], v[16:17]
 ; SI-GISEL-NEXT:    v_mul_f64 v[48:49], v[30:31], v[22:23]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x7c89e6b0
-; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[30:31], v[28:29], v[48:49]
 ; SI-GISEL-NEXT:    v_mul_f64 v[22:23], v[16:17], v[22:23]
+; SI-GISEL-NEXT:    v_fma_f64 v[30:31], v[30:31], v[28:29], v[48:49]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3efa0199
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[12:13]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[28:29], v[22:23]
 ; SI-GISEL-NEXT:    v_fma_f64 v[28:29], v[30:31], v[24:25], v[26:27]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v35, 0x3f2a01a0
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[34:35]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[16:17], v[24:25], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[30:31], v[28:29], v[32:33]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v37, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[36:37]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[16:17], v[24:25], v[32:33]
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[30:31], v[26:27], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v39, 0x3f811111
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[38:39]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v50, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[24:25], v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[30:31], v[26:27], v[34:35]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v50, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v51, 0x3fa55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[50:51]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[30:31], v[24:25], v[36:37]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3fc55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[20:21]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 11
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v26, v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[30:31], v[24:25], v[38:39]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v49, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[34:35]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], v[48:49]
@@ -1235,15 +1235,15 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[30:31], v[14:15], v[20:21]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[18:19], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[30:31], v[14:15], v[48:49]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x40900000
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[50:51]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v26
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[30:31], v[14:15], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v6, v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0xc090cc00
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[12:13], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x7ff00000
@@ -1307,8 +1307,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s17, 0x3fa55555
 ; VI-SDAG-NEXT:    s_mov_b32 s18, 0x55555511
 ; VI-SDAG-NEXT:    v_fma_f64 v[18:19], v[12:13], s[4:5], v[18:19]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s19, 0x3fc55555
 ; VI-SDAG-NEXT:    v_mul_f64 v[16:17], v[10:11], s[8:9]
 ; VI-SDAG-NEXT:    s_mov_b32 s20, 11
@@ -1422,8 +1422,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[18:19], v[18:19], v[16:17], v[22:23]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[20:21]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[16:17], v[20:21]
 ; VI-GISEL-NEXT:    v_fma_f64 v[24:25], v[18:19], v[16:17], v[20:21]
@@ -1479,8 +1479,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v17, v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[14:15], v20
+; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[14:15], v[18:19], v16
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v17
@@ -1539,8 +1539,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s17, 0x3fa55555
 ; GFX900-SDAG-NEXT:    s_mov_b32 s18, 0x55555511
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[18:19], v[12:13], s[4:5], v[18:19]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s19, 0x3fc55555
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[16:17], v[10:11], s[8:9]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s20, 11
@@ -1654,8 +1654,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[18:19], v[18:19], v[16:17], v[22:23]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[16:17], v[20:21]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v16, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v17, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[22:23], v[14:15], v[16:17], v[20:21]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[24:25], v[18:19], v[16:17], v[20:21]
@@ -1711,8 +1711,8 @@ define <3 x double> @v_exp10_v3f64(<3 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v17, v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[14:15], v20
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[14:15], v[18:19], v16
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v17
@@ -1788,17 +1788,17 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[16:17], v[8:9], s[16:17], v[0:1]
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v15, v15, v18, vcc
 ; SI-SDAG-NEXT:    v_fma_f64 v[16:17], v[8:9], s[14:15], v[16:17]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
-; SI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], s[20:21]
 ; SI-SDAG-NEXT:    s_mov_b32 s23, 0x40026bb1
+; SI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], s[20:21]
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[18:19], v[16:17], s[22:23], v[18:19]
-; SI-SDAG-NEXT:    s_mov_b32 s10, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v17, s5
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v16, s4
-; SI-SDAG-NEXT:    s_mov_b32 s12, 0x623fde64
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[18:19], s[10:11], v[16:17]
+; SI-SDAG-NEXT:    s_mov_b32 s12, 0x623fde64
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3ec71dee
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], s[12:13]
 ; SI-SDAG-NEXT:    s_mov_b32 s18, 0x7c89e6b0
@@ -1824,10 +1824,10 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], s[56:57]
 ; SI-SDAG-NEXT:    s_mov_b32 s28, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s40, 0
+; SI-SDAG-NEXT:    s_mov_b32 s29, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[10:11], s[16:17], v[2:3]
-; SI-SDAG-NEXT:    s_mov_b32 s29, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s40, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[20:21], v[10:11], s[14:15], v[20:21]
 ; SI-SDAG-NEXT:    s_mov_b32 s41, 0xc090cc00
 ; SI-SDAG-NEXT:    v_mul_f64 v[22:23], v[20:21], s[20:21]
@@ -1947,26 +1947,26 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v21, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[12:13], v[24:25], -v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x509f79ff
-; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[22:23]|, v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3fd34413
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, v12, v22, vcc
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[22:23]|, v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[18:19], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, v12, v22, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0xa994fd21
+; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, v13, v23, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[18:19], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[24:25], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x494ea3e9
-; SI-GISEL-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[24:25], v[22:23]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0xbcaf48ad
+; SI-GISEL-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[24:25], v[22:23]
 ; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[12:13], v[20:21]
-; SI-GISEL-NEXT:    v_mul_f64 v[30:31], v[22:23], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x40026bb1
+; SI-GISEL-NEXT:    v_mul_f64 v[30:31], v[22:23], v[20:21]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[28:29], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[22:23], v[28:29], v[30:31]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], -v[16:17], v[18:19], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[18:19], v[6:7]
@@ -1978,12 +1978,12 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[34:35], v[36:37]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v35, 0x3efa0199
-; SI-GISEL-NEXT:    v_mul_f64 v[38:39], v[26:27], v[20:21]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[34:35]
+; SI-GISEL-NEXT:    v_mul_f64 v[38:39], v[26:27], v[20:21]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v49, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[26:27], v[28:29], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[48:49]
+; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[26:27], v[28:29], v[38:39]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v39, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_mul_f64 v[20:21], v[18:19], v[20:21]
@@ -2025,9 +2025,9 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[34:35]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[34:35]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[48:49]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[48:49]
+; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[38:39]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[50:51]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x40900000
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], v[24:25]
@@ -2042,11 +2042,11 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[24:25]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[26:27], v[14:15], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[28:29]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v17, v16, v9, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[14:15], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[4:5], v[30:31]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0xc090cc00
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[18:19], v[20:21], v[52:53]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v8, vcc
@@ -2090,8 +2090,8 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s72, 0xbbb55516
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[10:11], v[8:9]
 ; VI-SDAG-NEXT:    s_mov_b32 s73, 0x40026bb1
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
 ; VI-SDAG-NEXT:    s_mov_b32 s18, 0x623fde64
@@ -2115,8 +2115,8 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[12:13], v[8:9], s[62:63]
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; VI-SDAG-NEXT:    s_mov_b32 s44, 0
-; VI-SDAG-NEXT:    s_mov_b32 s46, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s45, 0x40900000
+; VI-SDAG-NEXT:    s_mov_b32 s46, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s47, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[44:45], v[0:1]
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[46:47], v[2:3]
@@ -2372,8 +2372,8 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s72, 0xbbb55516
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[10:11], v[8:9]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s73, 0x40026bb1
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    s_mov_b32 s18, 0x623fde64
@@ -2397,8 +2397,8 @@ define <4 x double> @v_exp10_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[12:13], v[8:9], s[62:63]
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s44, 0
-; GFX900-SDAG-NEXT:    s_mov_b32 s46, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s45, 0x40900000
+; GFX900-SDAG-NEXT:    s_mov_b32 s46, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s47, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[44:45], v[0:1]
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[46:47], v[2:3]
@@ -2673,9 +2673,9 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2706,8 +2706,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40900000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
@@ -2749,8 +2749,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2784,8 +2784,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -2801,8 +2801,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x979a371
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x400a934f
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s2, 0x509f79ff
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT:    s_mov_b32 s2, 0x509f79ff
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0xbfd34413
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
@@ -2818,9 +2818,9 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s2, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; VI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2884,8 +2884,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2935,8 +2935,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x979a371
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x400a934f
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x509f79ff
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x509f79ff
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0xbfd34413
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
@@ -2952,9 +2952,9 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -3018,8 +3018,8 @@ define amdgpu_ps <2 x i32> @s_exp10_f64(double inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -3098,8 +3098,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0x494ea3e9
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0xbcaf48ad
 ; SI-SDAG-NEXT:    v_mul_f64 v[7:8], v[5:6], s[10:11]
-; SI-SDAG-NEXT:    s_mov_b32 s12, 0xbbb55516
 ; SI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s12, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[5:6], s[12:13], v[7:8]
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s28, v12, v1
@@ -3114,9 +3114,9 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[6:7], v[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[8:9], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[4:5], s[10:11]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s14
+; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[4:5], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s15
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[12:13], v[12:13]
@@ -3160,8 +3160,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v13, v[0:1]
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x40900000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v12
@@ -3205,9 +3205,9 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[6:7], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x509f79ff
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fd34413
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[6:7], -v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fd34413
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[10:11], s[0:1]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -3264,14 +3264,14 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[8:9], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[16:17]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[16:17]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v13, v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v12, v10, v7, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
@@ -3301,16 +3301,16 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-SDAG-NEXT:    v_mul_f64 v[2:3], s[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, s1
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0xbfd34413
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[0:1], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xa994fd21
@@ -3327,8 +3327,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[4:5], v[10:11]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[4:5], v[8:9]
@@ -3373,8 +3373,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v3
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3415,13 +3415,14 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x494ea3e9
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0xbcaf48ad
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[8:9], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[10:11]
@@ -3464,7 +3465,6 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_cvt_i32_f64_e32 v8, v[0:1]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -3495,16 +3495,16 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[2:3], s[2:3], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, s1
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x509f79ff
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0xbfd34413
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[4:5], v[4:5]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[0:1], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xa994fd21
@@ -3521,8 +3521,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[4:5], v[10:11]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[4:5], v[8:9]
@@ -3567,8 +3567,8 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v2
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v3
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3609,13 +3609,14 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x494ea3e9
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0xbcaf48ad
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[8:9]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[10:11]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[8:9], v[10:11]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[10:11]
@@ -3658,7 +3659,6 @@ define amdgpu_ps <4 x i32> @s_exp10_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f64_e32 v8, v[0:1]
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[2:3], v[2:3]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
@@ -3720,8 +3720,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s18, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[5:6], s[14:15], v[7:8]
 ; SI-SDAG-NEXT:    s_mov_b32 s19, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, s18
+; SI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s19
 ; SI-SDAG-NEXT:    v_fma_f64 v[9:10], v[5:6], s[16:17], v[7:8]
@@ -3751,7 +3751,7 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[9:10], v[5:6], v[9:10], s[34:35]
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[9:10], v[5:6], v[9:10], 1.0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v19, 0x7ff00000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v12, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[5:6], v[9:10], 1.0
 ; SI-SDAG-NEXT:    v_mul_f64 v[9:10], s[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
@@ -3766,11 +3766,11 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[9:10], s[8:9], v[5:6]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v12, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[9:10], s[10:11], v[5:6]
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s33, v17, v1
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[11:12]
 ; SI-SDAG-NEXT:    v_add_f64 v[17:18], v[0:1], v[4:5]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v19, 0x7ff00000
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v20, v19, v3, vcc
 ; SI-SDAG-NEXT:    v_add_f64 v[3:4], v[17:18], -v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, s[6:7]
@@ -3857,22 +3857,22 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mul_f64 v[0:1], s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fd34413
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[14:15], -v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[6:7]|, v[8:9]
 ; SI-GISEL-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fd34413
-; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], s[0:1]
+; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xa994fd21
+; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[16:17], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x494ea3e9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0xbcaf48ad
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[14:15], -v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0xbcaf48ad
 ; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[18:19]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
@@ -3880,25 +3880,25 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[10:11], s[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[12:13], v[14:15], v[20:21]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[10:11], s[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3ec71dee
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[16:17], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[10:11], s[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[12:13], v[22:23], v[24:25]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[16:17], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3efa0199
-; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[8:9], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[16:17], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[22:23], v[28:29]
+; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[8:9], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[4:5], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[12:13], v[16:17], v[22:23]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[4:5], v[26:27]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_mul_f64 v[18:19], v[10:11], v[18:19]
@@ -3941,9 +3941,9 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v15, v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[8:9], v[16:17]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc
@@ -3986,9 +3986,9 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[2:3], s[4:5], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0x509f79ff
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0x509f79ff
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0xbfd34413
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -3997,8 +3997,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v11, s1
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[6:7], v[6:7]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], s[6:7], v[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[0:1], s[6:7], v[10:11]
@@ -4018,8 +4018,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[6:7], v[12:13]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[6:7], v[14:15]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[6:7], v[16:17]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s9
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s8
 ; VI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[6:7], v[12:13]
@@ -4076,8 +4076,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v14
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[4:5], v[8:9]
@@ -4138,8 +4138,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[16:17]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[12:13]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[16:17], v[8:9], v[14:15], v[12:13]
 ; VI-GISEL-NEXT:    v_fma_f64 v[18:19], v[10:11], v[14:15], v[12:13]
@@ -4201,8 +4201,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[14:15]
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[6:7]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[6:7]
@@ -4236,9 +4236,9 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[2:3], s[4:5], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x509f79ff
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x509f79ff
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0xbfd34413
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -4247,8 +4247,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v11, s1
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[2:3], s[6:7], v[6:7]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[4:5], s[6:7], v[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[0:1], s[6:7], v[10:11]
@@ -4268,8 +4268,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[6:7], v[12:13]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[6:7], v[14:15]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[6:7], v[16:17]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s9
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s8
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[6:7], v[12:13]
@@ -4326,8 +4326,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[6:7], v14
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[14:15], s[4:5], v[8:9]
@@ -4388,8 +4388,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[16:17]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[12:13]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[16:17], v[8:9], v[14:15], v[12:13]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[18:19], v[10:11], v[14:15], v[12:13]
@@ -4451,8 +4451,8 @@ define amdgpu_ps <6 x i32> @s_exp10_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[14:15]
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[14:15]
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[6:7]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[6:7]
@@ -4532,11 +4532,11 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[12:13]|, s[8:9]
 ; SI-SDAG-NEXT:    s_mov_b32 s20, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s21, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s18, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v13, v15, v13, vcc
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[10:11], s[14:15]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SI-SDAG-NEXT:    s_mov_b32 s18, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s19, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v17, s21
 ; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[16:17], v[14:15]
@@ -4550,25 +4550,25 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s24, 0x14761f6e
 ; SI-SDAG-NEXT:    s_mov_b32 s25, 0x3f2a01a0
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[24:25]
-; SI-SDAG-NEXT:    s_mov_b32 s26, 0x1852b7b0
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v19, v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], s[18:19], v[16:17]
+; SI-SDAG-NEXT:    s_mov_b32 s26, 0x1852b7b0
 ; SI-SDAG-NEXT:    s_mov_b32 s27, 0x3f56c16c
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[26:27]
-; SI-SDAG-NEXT:    s_mov_b32 s28, 0x11122322
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], s[20:21]
+; SI-SDAG-NEXT:    s_mov_b32 s28, 0x11122322
 ; SI-SDAG-NEXT:    s_mov_b32 s29, 0x3f811111
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[28:29]
-; SI-SDAG-NEXT:    s_mov_b32 s30, 0x555502a1
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], s[22:23]
+; SI-SDAG-NEXT:    s_mov_b32 s30, 0x555502a1
 ; SI-SDAG-NEXT:    s_mov_b32 s31, 0x3fa55555
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[30:31]
-; SI-SDAG-NEXT:    s_mov_b32 s34, 0x55555511
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], s[24:25]
+; SI-SDAG-NEXT:    s_mov_b32 s34, 0x55555511
 ; SI-SDAG-NEXT:    s_mov_b32 s35, 0x3fc55555
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[34:35]
-; SI-SDAG-NEXT:    s_mov_b32 s36, 11
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], s[26:27]
+; SI-SDAG-NEXT:    s_mov_b32 s36, 11
 ; SI-SDAG-NEXT:    s_mov_b32 s37, 0x3fe00000
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], v[14:15], s[36:37]
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], s[28:29]
@@ -4591,23 +4591,23 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f64 v[14:15], v[0:1], v[4:5]
 ; SI-SDAG-NEXT:    v_fma_f64 v[18:19], v[10:11], s[18:19], v[16:17]
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[14:15], -v[4:5]
-; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[8:9]
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[10:11], v[18:19], s[20:21]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], s[22:23]
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, s1
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], s[22:23]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, s0
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[10:11], v[4:5]
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], s[24:25]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[10:11], v[4:5]
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[10:11], v[14:15], s[26:27]
-; SI-SDAG-NEXT:    v_mul_f64 v[18:19], v[4:5], s[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[0:1], s[12:13], v[4:5]
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v20, v[8:9]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[10:11], v[14:15], s[28:29]
+; SI-SDAG-NEXT:    v_mul_f64 v[18:19], v[4:5], s[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[10:11], v[8:9], s[30:31]
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[16:17], v[18:19]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[10:11], v[8:9], s[30:31]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v19, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[10:11], v[8:9], s[34:35]
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[6:7], v[18:19]
@@ -4702,21 +4702,21 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3fd34413
 ; SI-GISEL-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[2:3], v[12:13], s[0:1]
+; SI-GISEL-NEXT:    v_add_f64 v[18:19], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v15, v6, v15, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xa994fd21
-; SI-GISEL-NEXT:    v_add_f64 v[18:19], v[0:1], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[2:3], v[6:7], v[16:17]
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[18:19], -v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[12:13], s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0xbcaf48ad
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[16:17], v[18:19]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbbb55516
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[16:17], v[20:21], v[4:5]
 ; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[8:9], v[18:19]
@@ -4724,8 +4724,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[16:17]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[14:15], v[6:7], v[22:23]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[4:5], v[22:23], v[24:25]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x623fde64
@@ -4734,21 +4734,21 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[4:5], v[26:27], v[28:29]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3efa0199
-; SI-GISEL-NEXT:    v_mul_f64 v[30:31], v[16:17], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[0:1], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[26:27], v[32:33]
+; SI-GISEL-NEXT:    v_mul_f64 v[30:31], v[16:17], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[20:21], v[30:31]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[12:13], v[26:27]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[20:21], v[30:31]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mul_f64 v[18:19], v[6:7], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[12:13], v[30:31]
+; SI-GISEL-NEXT:    v_mul_f64 v[18:19], v[6:7], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v35, 0x3f811111
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[20:21], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[12:13], v[34:35]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[20:21], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3fa55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[12:13], v[18:19]
@@ -4850,8 +4850,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], s[4:5], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[6:7], s[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0x509f79ff
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0x509f79ff
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xbfd34413
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -4860,8 +4860,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v10, s4
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    s_mov_b32 s10, 0xa994fd21
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s3
+; VI-SDAG-NEXT:    s_mov_b32 s10, 0xa994fd21
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[2:3], s[8:9], v[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[8:9], v[10:11]
 ; VI-SDAG-NEXT:    s_mov_b32 s11, 0x3c49dc1d
@@ -4890,8 +4890,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], s[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[10:11], v[18:19]
 ; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[14:15], s[8:9]
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s18, 0x11122322
 ; VI-SDAG-NEXT:    s_mov_b32 s19, 0x3f811111
 ; VI-SDAG-NEXT:    s_mov_b32 s20, 0x555502a1
@@ -4962,8 +4962,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_fma_f64 v[16:17], v[14:15], v[16:17], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[14:15], v[16:17], 1.0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v14, v[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v19, 0x40900000
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[10:11], v21
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[18:19]
@@ -5156,8 +5156,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[4:5], s[4:5], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[6:7], s[2:3], v[0:1]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0x509f79ff
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0x509f79ff
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xbfd34413
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
@@ -5166,8 +5166,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v10, s4
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
-; GFX900-SDAG-NEXT:    s_mov_b32 s10, 0xa994fd21
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s3
+; GFX900-SDAG-NEXT:    s_mov_b32 s10, 0xa994fd21
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[2:3], s[8:9], v[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[8:9], v[10:11]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s11, 0x3c49dc1d
@@ -5196,8 +5196,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], s[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[10:11], v[18:19]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[18:19], v[14:15], s[8:9]
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s18, 0x11122322
 ; GFX900-SDAG-NEXT:    s_mov_b32 s19, 0x3f811111
 ; GFX900-SDAG-NEXT:    s_mov_b32 s20, 0x555502a1
@@ -5268,8 +5268,8 @@ define amdgpu_ps <8 x i32> @s_exp10_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[16:17], v[14:15], v[16:17], 1.0
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], 1.0
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[2:3], v[14:15], v[16:17], 1.0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v14, v[0:1]
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v19, 0x40900000
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[10:11], v21
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[18:19]
@@ -5486,9 +5486,9 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5519,8 +5519,8 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5561,8 +5561,8 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5596,8 +5596,8 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5628,9 +5628,9 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5692,8 +5692,8 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5758,9 +5758,9 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5822,8 +5822,8 @@ define double @v_exp10_fabs_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5900,9 +5900,9 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5933,8 +5933,8 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5975,8 +5975,8 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6010,8 +6010,8 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -|v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -6042,9 +6042,9 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6106,8 +6106,8 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6172,9 +6172,9 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6236,8 +6236,8 @@ define double @v_exp10_fneg_fabs_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6315,9 +6315,9 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6348,8 +6348,8 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
@@ -6390,8 +6390,8 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6425,8 +6425,8 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -6457,9 +6457,9 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6521,8 +6521,8 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6587,9 +6587,9 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6651,8 +6651,8 @@ define double @v_exp10_fneg_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6729,9 +6729,9 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6798,8 +6798,8 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6857,9 +6857,9 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6916,8 +6916,8 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6975,9 +6975,9 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7034,8 +7034,8 @@ define double @v_exp10_f64_fast(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7106,9 +7106,9 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7139,8 +7139,8 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -7181,8 +7181,8 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7216,8 +7216,8 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -7248,9 +7248,9 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7312,8 +7312,8 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7378,9 +7378,9 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7442,8 +7442,8 @@ define double @v_exp10_f64_afn(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7519,9 +7519,9 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7588,8 +7588,8 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7647,9 +7647,9 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7706,8 +7706,8 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7765,9 +7765,9 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7824,8 +7824,8 @@ define double @v_exp10_f64_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7896,9 +7896,9 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7929,8 +7929,8 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -7971,8 +7971,8 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8006,8 +8006,8 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -8038,9 +8038,9 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8102,8 +8102,8 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8168,9 +8168,9 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8232,8 +8232,8 @@ define double @v_exp10_f64_nnan(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8309,9 +8309,9 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8342,8 +8342,8 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -8384,8 +8384,8 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8419,8 +8419,8 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -8451,9 +8451,9 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8515,8 +8515,8 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8581,9 +8581,9 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8645,8 +8645,8 @@ define double @v_fabs_exp10_f64_afn(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8723,9 +8723,9 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8792,8 +8792,8 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8851,9 +8851,9 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8910,8 +8910,8 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8969,9 +8969,9 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9028,8 +9028,8 @@ define double @v_exp10_f64_nnan_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9128,9 +9128,9 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9161,8 +9161,8 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -9197,40 +9197,40 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[8:9], v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xbc49dc1d
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[10:11], v[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0xbcaf48ad
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[14:15]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40026bb1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[0:1]
@@ -9238,10 +9238,10 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[2:3], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -9274,9 +9274,9 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9340,8 +9340,8 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9408,9 +9408,9 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9474,8 +9474,8 @@ define double @v_exp10_f64_from_fpext_f16(half %src) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9534,8 +9534,8 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    s_brev_b32 s4, -2
 ; SI-SDAG-NEXT:    v_bfi_b32 v5, s4, v4, v3
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[6:7], -v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[6:7]
@@ -9553,9 +9553,9 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9586,8 +9586,8 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -9618,19 +9618,19 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fd34413
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xa994fd21
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[10:11], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xa994fd21
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x494ea3e9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0xbcaf48ad
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xbbb55516
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9697,9 +9697,9 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9762,8 +9762,8 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9829,9 +9829,9 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9894,8 +9894,8 @@ define double @v_exp10_f64_from_fpext_f32(float %src) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9978,9 +9978,9 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10011,8 +10011,8 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[2:3]
@@ -10034,12 +10034,12 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x432fffff
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x400a934f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x509f79ff
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fd34413
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xa994fd21
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x979a371
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x400a934f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xa994fd21
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xbc49dc1d
 ; SI-GISEL-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x494ea3e9
@@ -10058,33 +10058,33 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[10:11], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[14:15]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[0:1]
@@ -10092,10 +10092,10 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[2:3], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
@@ -10129,9 +10129,9 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10196,8 +10196,8 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10265,9 +10265,9 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10332,8 +10332,8 @@ define double @v_exp10_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10411,9 +10411,9 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10444,8 +10444,8 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -10486,8 +10486,8 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10521,8 +10521,8 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -10553,9 +10553,9 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10617,8 +10617,8 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10683,9 +10683,9 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10747,8 +10747,8 @@ define double @v_exp10_f64_contract(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10824,9 +10824,9 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -10893,8 +10893,8 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40026bb1
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -10952,9 +10952,9 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -11011,8 +11011,8 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -11070,9 +11070,9 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbbb55516
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x40026bb1
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -11129,8 +11129,8 @@ define double @v_exp10_f64_contract_nnan_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f64.ll
index 1ce840640f059..8587e5a510411 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f64.ll
@@ -14,8 +14,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -28,9 +28,9 @@ define double @v_exp2_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -61,8 +61,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -95,8 +95,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -130,8 +130,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -154,9 +154,9 @@ define double @v_exp2_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -210,8 +210,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -268,9 +268,9 @@ define double @v_exp2_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -324,8 +324,8 @@ define double @v_exp2_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -394,16 +394,16 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x3c7abc9e
 ; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[8:9], s[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0x3fe62e42
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_add_f64 v[14:15], v[2:3], -v[4:5]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[8:9], v[10:11]
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, s11
 ; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[14:15], s[6:7]
@@ -446,9 +446,9 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], s[6:7]
 ; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40900000
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], 1.0
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v4, v[4:5]
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
@@ -500,8 +500,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[12:13], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[12:13], v[14:15], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[12:13], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[8:9], v[12:13], v[14:15]
@@ -540,8 +540,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[10:11], v[16:17], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], 1.0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x40900000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[14:15]
 ; SI-GISEL-NEXT:    v_cmp_nlt_f64_e64 s[4:5], v[0:1], v[16:17]
@@ -571,8 +571,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[2:3]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3c7abc9e
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s8, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_add_f64 v[8:9], v[0:1], -v[4:5]
@@ -585,8 +585,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[4:5], v[12:13]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[4:5], v[14:15]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s7
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; VI-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -664,8 +664,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[16:17], v[14:15]
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], v[12:13]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[16:17], v[8:9], v[12:13], v[14:15]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[10:11], v[12:13], v[14:15]
@@ -707,8 +707,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], 1.0
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[10:11], v6
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[10:11], v6
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[8:9]
@@ -733,8 +733,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[6:7], v[2:3]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3c7abc9e
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_add_f64 v[8:9], v[0:1], -v[4:5]
@@ -747,8 +747,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[4:5], v[12:13]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[4:5], v[14:15]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s7
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s6
 ; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -826,8 +826,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[16:17], v[14:15]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[16:17], v[12:13]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[16:17], v[8:9], v[12:13], v[14:15]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[10:11], v[12:13], v[14:15]
@@ -869,8 +869,8 @@ define <2 x double> @v_exp2_v2f64(<2 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], 1.0
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0
-; GFX900-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[10:11], v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
+; GFX900-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[10:11], v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[8:9]
@@ -911,12 +911,12 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3c7abc9e
 ; SI-SDAG-NEXT:    v_mul_f64 v[13:14], v[7:8], s[10:11]
 ; SI-SDAG-NEXT:    s_mov_b32 s12, 0xfefa39ef
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3fe62e42
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[13:14], v[7:8], s[12:13], v[13:14]
-; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s5
+; SI-SDAG-NEXT:    s_mov_b32 s14, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-SDAG-NEXT:    v_fma_f64 v[15:16], v[13:14], s[14:15], v[8:9]
@@ -948,11 +948,11 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[15:16], v[13:14], v[15:16], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s40, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[11:12], v[13:14], v[15:16], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s42, 0
+; SI-SDAG-NEXT:    s_mov_b32 s41, 0x40900000
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[11:12], v[11:12], v7
 ; SI-SDAG-NEXT:    v_bfi_b32 v7, s46, v10, v3
 ; SI-SDAG-NEXT:    v_add_f64 v[13:14], v[2:3], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s41, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s42, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s43, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[40:41], v[0:1]
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[42:43], v[0:1]
@@ -1034,66 +1034,66 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v1, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[12:13], v[0:1], -v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x3b39803f
 ; SI-GISEL-NEXT:    v_add_f64 v[18:19], v[18:19], -v[8:9]
 ; SI-GISEL-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_or_b32_e32 v9, 0x43300000, v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[14:15]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, v[10:11]
 ; SI-GISEL-NEXT:    v_add_f64 v[28:29], v[4:5], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[20:21], v[16:17]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v16, v18, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v17, v19, v3, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[8:9], v[28:29], -v[8:9]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[4:5]|, v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_add_f64 v[32:33], v[2:3], -v[16:17]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3e928af3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v5, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[18:19], v[22:23]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x623fde64
 ; SI-GISEL-NEXT:    v_mul_f64 v[36:37], v[32:33], v[14:15]
 ; SI-GISEL-NEXT:    v_add_f64 v[50:51], v[4:5], -v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3ec71dee
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[26:27]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[32:33], v[32:33], v[20:21], v[36:37]
 ; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[50:51], v[14:15]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3efa0199
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[30:31]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[50:51], v[20:21], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[50:51], v[32:33], v[18:19], v[22:23]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v34, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v35, 0x3f2a01a0
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[34:35]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[22:23]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[32:33], v[50:51], v[26:27]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[28:29]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[32:33], v[22:23], v[30:31]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v38, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v39, 0x3f811111
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[12:13], v[24:25], v[38:39]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[32:33], v[22:23], v[34:35]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3fa55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[24:25]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[32:33], v[22:23], v[28:29]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v48, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v49, 0x3fc55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[48:49]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[30:31]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v26, v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[32:33], v[22:23], v[38:39]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v37, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], v[36:37]
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[34:35]
@@ -1102,16 +1102,16 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[28:29]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[32:33], v[6:7], v[48:49]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[12:13], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[38:39]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[32:33], v[6:7], v[36:37]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v26
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], v[24:25]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[32:33], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v16, v[16:17]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0xc090cc00
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
@@ -1173,8 +1173,8 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], s[6:7], v[18:19]
 ; VI-SDAG-NEXT:    s_mov_b32 s21, 0x3fe00000
 ; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[14:15], s[4:5]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_fma_f64 v[14:15], v[14:15], s[6:7], v[18:19]
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v19, s5
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v18, s4
@@ -1266,8 +1266,8 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[16:17], v[18:19], v[22:23], v[16:17]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[22:23], v[14:15]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v18, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[22:23], v[10:11], v[18:19], v[20:21]
 ; VI-GISEL-NEXT:    v_fma_f64 v[24:25], v[16:17], v[18:19], v[20:21]
@@ -1381,8 +1381,8 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], s[6:7], v[18:19]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s21, 0x3fe00000
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[18:19], v[14:15], s[4:5]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[14:15], v[14:15], s[6:7], v[18:19]
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v19, s5
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v18, s4
@@ -1474,8 +1474,8 @@ define <3 x double> @v_exp2_v3f64(<3 x double> %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[16:17], v[18:19], v[22:23], v[16:17]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[14:15], v[20:21], v[22:23], v[14:15]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v18, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v20, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[22:23], v[10:11], v[18:19], v[20:21]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[24:25], v[16:17], v[18:19], v[20:21]
@@ -1580,12 +1580,12 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s19, 0x3c7abc9e
 ; SI-SDAG-NEXT:    v_mul_f64 v[13:14], v[11:12], s[18:19]
 ; SI-SDAG-NEXT:    s_mov_b32 s20, 0xfefa39ef
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s21, 0x3fe62e42
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[11:12], s[20:21], v[13:14]
-; SI-SDAG-NEXT:    s_mov_b32 s22, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v13, s5
+; SI-SDAG-NEXT:    s_mov_b32 s22, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s23, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v12, s4
 ; SI-SDAG-NEXT:    v_fma_f64 v[18:19], v[14:15], s[22:23], v[12:13]
@@ -1616,9 +1616,9 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-SDAG-NEXT:    v_cvt_i32_f64_e32 v8, v[8:9]
 ; SI-SDAG-NEXT:    v_fma_f64 v[18:19], v[14:15], v[18:19], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s58, 0
-; SI-SDAG-NEXT:    s_mov_b32 s60, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[14:15], v[14:15], v[18:19], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s59, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s60, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s61, 0xc090cc00
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[58:59], v[0:1]
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[60:61], v[0:1]
@@ -1725,16 +1725,16 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x432fffff
 ; SI-GISEL-NEXT:    v_add_f64 v[8:9], v[8:9], -v[10:11]
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[18:19]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x3b39803f
+; SI-GISEL-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[16:17], v[0:1], -v[8:9]
-; SI-GISEL-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_or_b32_e32 v11, 0x43300000, v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[20:21], v[16:17], v[14:15]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_add_f64 v[22:23], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[16:17], v[12:13], v[20:21]
 ; SI-GISEL-NEXT:    v_add_f64 v[16:17], v[22:23], -v[10:11]
@@ -1758,8 +1758,8 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[22:23], v[26:27], v[12:13], v[30:31]
 ; SI-GISEL-NEXT:    v_add_f64 v[26:27], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3e928af3
 ; SI-GISEL-NEXT:    v_add_f64 v[10:11], v[26:27], -v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[26:27], v[24:25], v[30:31], v[32:33]
@@ -1770,9 +1770,9 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3efa0199
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[24:25], v[18:19], v[26:27]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x14761f6e
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v7, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v36, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v37, 0x3f2a01a0
 ; SI-GISEL-NEXT:    v_add_f64 v[38:39], v[6:7], -v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[24:25], v[18:19], v[36:37]
@@ -1817,9 +1817,9 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[36:37]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0
-; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[48:49]
 ; SI-GISEL-NEXT:    v_fma_f64 v[24:25], v[12:13], v[24:25], v[36:37]
+; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[48:49]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v30, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[50:51]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v31, 0x40900000
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], v[18:19]
@@ -1834,11 +1834,11 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[12:13], v[24:25], v[18:19]
 ; SI-GISEL-NEXT:    v_fma_f64 v[20:21], v[22:23], v[20:21], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[12:13], v[18:19], v[38:39]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v17, v16, v9, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[8:9], v[20:21], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[4:5], v[30:31]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0xc090cc00
 ; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[12:13], v[18:19], v[52:53]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v20, 0, v8, vcc
@@ -1875,8 +1875,8 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s57, 0x3c7abc9e
 ; VI-SDAG-NEXT:    s_mov_b32 s58, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s59, 0x3fe62e42
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_add_f64 v[8:9], v[0:1], -v[10:11]
 ; VI-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
@@ -1903,8 +1903,8 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; VI-SDAG-NEXT:    s_mov_b32 s44, 0
 ; VI-SDAG-NEXT:    v_fma_f64 v[14:15], v[12:13], s[16:17], v[8:9]
-; VI-SDAG-NEXT:    s_mov_b32 s46, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s45, 0x40900000
+; VI-SDAG-NEXT:    s_mov_b32 s46, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s47, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[44:45], v[0:1]
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[46:47], v[0:1]
@@ -2129,8 +2129,8 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s57, 0x3c7abc9e
 ; GFX900-SDAG-NEXT:    s_mov_b32 s58, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s59, 0x3fe62e42
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s16, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_add_f64 v[8:9], v[0:1], -v[10:11]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s17, 0x3e5ade15
@@ -2157,8 +2157,8 @@ define <4 x double> @v_exp2_v4f64(<4 x double> %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v10, v[10:11]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s44, 0
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[14:15], v[12:13], s[16:17], v[8:9]
-; GFX900-SDAG-NEXT:    s_mov_b32 s46, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s45, 0x40900000
+; GFX900-SDAG-NEXT:    s_mov_b32 s46, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s47, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[44:45], v[0:1]
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[46:47], v[0:1]
@@ -2402,9 +2402,9 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2435,8 +2435,8 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40900000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
@@ -2477,8 +2477,8 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2512,8 +2512,8 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
@@ -2536,9 +2536,9 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; VI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; VI-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2594,8 +2594,8 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2652,9 +2652,9 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], s[2:3], v[4:5]
-; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s3, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], s[2:3], v[4:5]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s2, 0x623fde64
@@ -2710,8 +2710,8 @@ define amdgpu_ps <2 x i32> @s_exp2_f64(double inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[4:5]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
@@ -2783,12 +2783,12 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3c7abc9e
 ; SI-SDAG-NEXT:    v_mul_f64 v[7:8], v[1:2], s[4:5]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xfefa39ef
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x3fe62e42
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[1:2], s[6:7], v[7:8]
-; SI-SDAG-NEXT:    s_mov_b32 s10, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v10, s9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s8
 ; SI-SDAG-NEXT:    v_fma_f64 v[1:2], v[7:8], s[10:11], v[9:10]
@@ -2842,9 +2842,9 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], s[22:23]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], s[4:5]
-; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x40900000
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[2:3], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[8:9]
@@ -2890,8 +2890,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    s_and_b64 s[4:5], s[2:3], s[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
 ; SI-GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[8:9]
@@ -2910,8 +2910,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3fc55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[8:9], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[10:11]
@@ -2947,14 +2947,14 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[6:7], v[8:9], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[16:17]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[16:17]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[8:9]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v2, v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v12, v10, v5, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
@@ -2982,8 +2982,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], s[0:1]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3c7abc9e
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_add_f64 v[4:5], s[2:3], -v[0:1]
 ; VI-SDAG-NEXT:    v_add_f64 v[6:7], s[0:1], -v[2:3]
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
@@ -2994,8 +2994,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[4:5], v[10:11]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[4:5], v[8:9]
@@ -3040,8 +3040,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v0
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v2
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; VI-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3077,8 +3077,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[8:9], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[10:11]
@@ -3150,8 +3150,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[2:3], s[0:1]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3b39803f
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3c7abc9e
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_add_f64 v[4:5], s[2:3], -v[0:1]
 ; GFX900-SDAG-NEXT:    v_add_f64 v[6:7], s[0:1], -v[2:3]
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
@@ -3162,8 +3162,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[4:5], v[10:11]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[4:5], s[4:5], v[8:9]
@@ -3208,8 +3208,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[4:5], v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v2
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40900000
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[2:3], v[4:5]
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f64_e64 s[8:9], s[2:3], v[6:7]
@@ -3245,8 +3245,8 @@ define amdgpu_ps <4 x i32> @s_exp2_v2f64(<2 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[10:11]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[12:13], v[4:5], v[8:9], v[10:11]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[6:7], v[8:9], v[10:11]
@@ -3339,12 +3339,12 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0x3c7abc9e
 ; SI-SDAG-NEXT:    v_mul_f64 v[7:8], v[5:6], s[8:9]
 ; SI-SDAG-NEXT:    s_mov_b32 s10, 0xfefa39ef
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    s_mov_b32 s11, 0x3fe62e42
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x3e928af3
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[5:6], s[10:11], v[7:8]
-; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, s7
+; SI-SDAG-NEXT:    s_mov_b32 s12, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s13, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, s6
 ; SI-SDAG-NEXT:    v_fma_f64 v[9:10], v[5:6], s[12:13], v[7:8]
@@ -3478,8 +3478,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    s_and_b64 s[10:11], s[2:3], s[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
 ; SI-GISEL-NEXT:    s_or_b64 s[10:11], s[10:11], s[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[8:9]
@@ -3501,26 +3501,26 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f64 v[14:15], v[14:15], -s[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v18, s4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v14, v18, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e5ade15
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, s5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v14, v18, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v18, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v19, 0x3e928af3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v15, v20, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[4:5], v[12:13], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3ec71dee
-; SI-GISEL-NEXT:    v_add_f64 v[16:17], s[2:3], -v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[4:5], v[14:15], v[20:21]
+; SI-GISEL-NEXT:    v_add_f64 v[16:17], s[2:3], -v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3efa0199
-; SI-GISEL-NEXT:    v_mul_f64 v[22:23], v[16:17], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[4:5], v[14:15], v[24:25]
+; SI-GISEL-NEXT:    v_mul_f64 v[22:23], v[16:17], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3f2a01a0
 ; SI-GISEL-NEXT:    v_add_f64 v[26:27], s[4:5], -v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[10:11], v[22:23]
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[4:5], v[14:15], v[28:29]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[10:11], v[22:23]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[26:27], v[6:7]
@@ -3550,23 +3550,23 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[28:29]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[12:13], v[28:29]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[22:23]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[12:13], v[22:23]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[30:31]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v18, v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[30:31]
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], v[6:7], v[12:13], v[22:23]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[14:15]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[26:27]
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v18
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], v[32:33]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[10:11], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[16:17], v[10:11], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[6:7], v[12:13], v[30:31]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x40900000
+; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v17, v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[6:7], v[10:11], v[14:15]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0
-; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[12:13]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0x7ff00000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
@@ -3606,8 +3606,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], s[0:1]
 ; VI-SDAG-NEXT:    s_mov_b32 s6, 0x3b39803f
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3c7abc9e
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_add_f64 v[6:7], s[4:5], -v[0:1]
 ; VI-SDAG-NEXT:    v_add_f64 v[8:9], s[2:3], -v[2:3]
 ; VI-SDAG-NEXT:    v_add_f64 v[10:11], s[0:1], -v[4:5]
@@ -3619,8 +3619,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[6:7], v[12:13]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[6:7], v[14:15]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[6:7], v[16:17]
-; VI-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v13, s9
+; VI-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s7, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v12, s8
 ; VI-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[6:7], v[12:13]
@@ -3677,10 +3677,10 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[4:5]
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v14
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
-; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v11
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[0:1], v[6:7]
 ; VI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
@@ -3728,8 +3728,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[18:19], v[16:17]
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[12:13]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[16:17], v[6:7], v[14:15], v[12:13]
 ; VI-GISEL-NEXT:    v_fma_f64 v[18:19], v[8:9], v[14:15], v[12:13]
@@ -3791,8 +3791,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v11
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; VI-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[6:7], s[2:3], v[14:15]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
@@ -3824,8 +3824,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_rndne_f64_e32 v[4:5], s[0:1]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x3b39803f
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3c7abc9e
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_add_f64 v[6:7], s[4:5], -v[0:1]
 ; GFX900-SDAG-NEXT:    v_add_f64 v[8:9], s[2:3], -v[2:3]
 ; GFX900-SDAG-NEXT:    v_add_f64 v[10:11], s[0:1], -v[4:5]
@@ -3837,8 +3837,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], s[6:7], v[12:13]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[6:7], v[14:15]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], s[6:7], v[16:17]
-; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v13, s9
+; GFX900-SDAG-NEXT:    s_mov_b32 s6, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s7, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v12, s8
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[14:15], v[6:7], s[6:7], v[12:13]
@@ -3895,10 +3895,10 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f64_e32 v11, v[4:5]
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[6:7], v14
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX900-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x40900000
-; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX900-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; GFX900-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v11
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v9, 0xc090cc00
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[0:1], v[6:7]
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[6:7]
@@ -3946,8 +3946,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[18:19], v[16:17]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[12:13]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v14, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v15, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[16:17], v[6:7], v[14:15], v[12:13]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[18:19], v[8:9], v[14:15], v[12:13]
@@ -4009,8 +4009,8 @@ define amdgpu_ps <6 x i32> @s_exp2_v3f64(<3 x double> inreg %in) #0 {
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v11
 ; GFX900-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[8:9], v10
 ; GFX900-GISEL-NEXT:    v_cmp_ngt_f64_e64 s[6:7], s[2:3], v[14:15]
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v17, 0x7ff00000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc090cc00
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
@@ -4066,8 +4066,8 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s14, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[1:2], s[12:13], v[7:8]
 ; SI-SDAG-NEXT:    s_mov_b32 s15, 0x3e928af3
-; SI-SDAG-NEXT:    s_mov_b32 s8, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v9, s14
+; SI-SDAG-NEXT:    s_mov_b32 s8, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s9, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v10, s15
 ; SI-SDAG-NEXT:    v_fma_f64 v[1:2], v[7:8], s[8:9], v[9:10]
@@ -4141,8 +4141,8 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f64 v[0:1], v[17:18], -v[0:1]
 ; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[5:6], s[8:9], v[9:10]
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[5:6], v[17:18], s[14:15]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], s[0:1], -v[0:1]
 ; SI-SDAG-NEXT:    v_fma_f64 v[17:18], v[5:6], v[17:18], s[16:17]
@@ -4232,13 +4232,13 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; SI-GISEL-NEXT:    s_or_b64 s[12:13], s[12:13], s[10:11]
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], s[0:1], -v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v12, s12
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3b39803f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3c7abc9e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, s13
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_add_f64 v[12:13], s[2:3], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_add_f64 v[8:9], v[12:13], -s[12:13]
@@ -4272,24 +4272,24 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v15, v19, vcc
 ; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[16:17], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v21, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v22, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v23, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[14:15], v[16:17], v[10:11], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[4:5], v[20:21], v[22:23]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v24, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v25, 0x3ec71dee
-; SI-GISEL-NEXT:    v_add_f64 v[18:19], s[4:5], -v[12:13]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[4:5], v[16:17], v[24:25]
+; SI-GISEL-NEXT:    v_add_f64 v[18:19], s[4:5], -v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v28, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v29, 0x3efa0199
-; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[4:5], v[16:17], v[28:29]
+; SI-GISEL-NEXT:    v_mul_f64 v[26:27], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v32, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v33, 0x3f2a01a0
 ; SI-GISEL-NEXT:    v_add_f64 v[30:31], s[6:7], -v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[18:19], v[10:11], v[26:27]
 ; SI-GISEL-NEXT:    v_fma_f64 v[16:17], v[4:5], v[16:17], v[32:33]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], v[18:19], v[10:11], v[26:27]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v26, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v27, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[30:31], v[6:7]
@@ -4419,8 +4419,8 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; VI-SDAG-NEXT:    v_mul_f64 v[12:13], v[14:15], s[8:9]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[10:11], v[16:17]
 ; VI-SDAG-NEXT:    v_mul_f64 v[16:17], v[18:19], s[8:9]
-; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; VI-SDAG-NEXT:    s_mov_b32 s22, 0x55555511
 ; VI-SDAG-NEXT:    s_mov_b32 s23, 0x3fc55555
 ; VI-SDAG-NEXT:    s_mov_b32 s24, 11
@@ -4687,8 +4687,8 @@ define amdgpu_ps <8 x i32> @s_exp2_v4f64(<4 x double> inreg %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[12:13], v[14:15], s[8:9]
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], s[10:11], v[16:17]
 ; GFX900-SDAG-NEXT:    v_mul_f64 v[16:17], v[18:19], s[8:9]
-; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s9, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s8, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    s_mov_b32 s22, 0x55555511
 ; GFX900-SDAG-NEXT:    s_mov_b32 s23, 0x3fc55555
 ; GFX900-SDAG-NEXT:    s_mov_b32 s24, 11
@@ -4935,10 +4935,10 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x43300000
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], |v[0:1]|, s[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc3300000
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[2:3], s[6:7]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -4952,9 +4952,9 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -4985,8 +4985,8 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5020,8 +5020,8 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5055,8 +5055,8 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5079,9 +5079,9 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5135,8 +5135,8 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5193,9 +5193,9 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5249,8 +5249,8 @@ define double @v_exp2_fabs_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5303,10 +5303,10 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc3300000
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], -|v[0:1]|, s[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x43300000
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[2:3], s[6:7]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5320,9 +5320,9 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5353,8 +5353,8 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -5388,8 +5388,8 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5423,8 +5423,8 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -|v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5447,9 +5447,9 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5503,8 +5503,8 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5561,9 +5561,9 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5617,8 +5617,8 @@ define double @v_exp2_fneg_fabs_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5689,9 +5689,9 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5722,8 +5722,8 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0xc0900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x4090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
@@ -5757,8 +5757,8 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5792,8 +5792,8 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, -v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -5816,9 +5816,9 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5872,8 +5872,8 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -5930,9 +5930,9 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -5986,8 +5986,8 @@ define double @v_exp2_fneg_f64(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6042,8 +6042,8 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -6056,9 +6056,9 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6117,8 +6117,8 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6168,9 +6168,9 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6219,8 +6219,8 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6270,9 +6270,9 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6321,8 +6321,8 @@ define double @v_exp2_f64_fast(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6371,8 +6371,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -6385,9 +6385,9 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6418,8 +6418,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -6452,8 +6452,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6487,8 +6487,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -6511,9 +6511,9 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6567,8 +6567,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6625,9 +6625,9 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6681,8 +6681,8 @@ define double @v_exp2_f64_afn(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6736,8 +6736,8 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -6750,9 +6750,9 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6811,8 +6811,8 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6862,9 +6862,9 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -6913,8 +6913,8 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -6964,9 +6964,9 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7015,8 +7015,8 @@ define double @v_exp2_f64_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7065,8 +7065,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -7079,9 +7079,9 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7112,8 +7112,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -7146,8 +7146,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7181,8 +7181,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -7205,9 +7205,9 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7261,8 +7261,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7319,9 +7319,9 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7375,8 +7375,8 @@ define double @v_exp2_f64_nnan(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7428,10 +7428,10 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x43300000
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], |v[0:1]|, s[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc3300000
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[2:3], s[6:7]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -7445,9 +7445,9 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7478,8 +7478,8 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -7513,8 +7513,8 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7548,8 +7548,8 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e64 vcc, |v[0:1]|, v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -7572,9 +7572,9 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7628,8 +7628,8 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7686,9 +7686,9 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7742,8 +7742,8 @@ define double @v_fabs_exp2_f64_afn(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7798,8 +7798,8 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -7812,9 +7812,9 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7873,8 +7873,8 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -7924,9 +7924,9 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -7975,8 +7975,8 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8026,9 +8026,9 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8077,8 +8077,8 @@ define double @v_exp2_f64_nnan_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8169,9 +8169,9 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8202,8 +8202,8 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -8237,33 +8237,33 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e5ade15
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[12:13]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[2:3]
@@ -8271,10 +8271,10 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -8299,9 +8299,9 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8357,8 +8357,8 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8417,9 +8417,9 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8475,8 +8475,8 @@ define double @v_exp2_f64_from_fpext_f16(half %src) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8546,9 +8546,9 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[5:6], v[5:6], s[4:5], v[7:8]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[5:6], s[4:5], v[7:8]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8579,8 +8579,8 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[7:8], v[5:6], v[7:8], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[3:4], v[5:6], v[7:8], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[3:4], v[3:4], v0
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[1:2]
@@ -8609,37 +8609,37 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3c7abc9e
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xfefa39ef
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3fe62e42
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x6a5dcb37
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[10:11], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v8, v[2:3]
@@ -8649,8 +8649,8 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[6:7]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
@@ -8674,9 +8674,9 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8731,8 +8731,8 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -8790,9 +8790,9 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -8847,8 +8847,8 @@ define double @v_exp2_f64_from_fpext_f32(float %src) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9002,26 +9002,26 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x623fde64
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[4:5], v[8:9], v[12:13]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ec71dee
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x7c89e6b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3efa0199
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x14761f6e
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f2a01a0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x1852b7b0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3f56c16c
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x11122322
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3f811111
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x555502a1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fa55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x55555511
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fc55555
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 11
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x3fe00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_cvt_i32_f64_e32 v10, v[2:3]
@@ -9029,10 +9029,10 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x40900000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -9058,9 +9058,9 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9117,8 +9117,8 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9178,9 +9178,9 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9237,8 +9237,8 @@ define double @v_exp2_f64_from_fpext_math_f16(half %src0, half %src1) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9294,8 +9294,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -9308,9 +9308,9 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9341,8 +9341,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], 1.0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x40900000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0xc090cc00
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-SDAG-NEXT:    v_cmp_nlt_f64_e32 vcc, s[4:5], v[0:1]
@@ -9375,8 +9375,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9410,8 +9410,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40900000
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
 ; SI-GISEL-NEXT:    v_cmp_ngt_f64_e32 vcc, v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc090cc00
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -9434,9 +9434,9 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9490,8 +9490,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9548,9 +9548,9 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9604,8 +9604,8 @@ define double @v_exp2_f64_contract(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9659,8 +9659,8 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SI-SDAG-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, -1
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -9673,9 +9673,9 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9734,8 +9734,8 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe62e42
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9785,9 +9785,9 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9836,8 +9836,8 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
@@ -9887,9 +9887,9 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xfefa39ef
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3fe62e42
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], s[4:5], v[6:7]
-; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v6, 0xfca7ab0c
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v7, 0x3e928af3
+; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x6a5dcb37
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3e5ade15
 ; GFX900-SDAG-NEXT:    v_fma_f64 v[6:7], v[4:5], s[4:5], v[6:7]
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x623fde64
@@ -9938,8 +9938,8 @@ define double @v_exp2_f64_contract_nnan_ninf(double %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[6:7]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x6a5dcb37
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3e5ade15
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0xfca7ab0c
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3e928af3
 ; GFX900-GISEL-NEXT:    v_fma_f64 v[6:7], v[4:5], v[6:7], v[8:9]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, 0x623fde64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 0c8dbe865a872..5cbe3e72ce5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -38,9 +38,9 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_add_i32 s34, s4, -4
+; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
 ; GFX10-NEXT:    s_min_u32 s36, s4, s34
 ; GFX10-NEXT:    s_mov_b32 s34, 0x1c84a50f
-; GFX10-NEXT:    s_mov_b32 s35, 0xb73e62d9
 ; GFX10-NEXT:    s_lshl_b32 s36, s36, 2
 ; GFX10-NEXT:    s_lshr_b64 s[34:35], s[34:35], s36
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
@@ -50,9 +50,9 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_add_i32 s0, s4, -4
+; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
 ; GFX11-NEXT:    s_min_u32 s2, s4, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT:    s_mov_b32 s1, 0xb73e62d9
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
@@ -162,8 +162,8 @@ define void @v_set_rounding(i32 %rounding) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX6-NEXT:    s_mov_b32 s5, 0xb73e62d9
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
@@ -175,8 +175,8 @@ define void @v_set_rounding(i32 %rounding) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, -4, v0
 ; GFX7-NEXT:    v_min_u32_e32 v0, v0, v1
-; GFX7-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX7-NEXT:    s_mov_b32 s5, 0xb73e62d9
 ; GFX7-NEXT:    v_lshr_b64 v[0:1], s[4:5], v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s4, v0
@@ -188,8 +188,8 @@ define void @v_set_rounding(i32 %rounding) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -4, v0
 ; GFX8-NEXT:    v_min_u32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX8-NEXT:    s_mov_b32 s5, 0xb73e62d9
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
@@ -201,8 +201,8 @@ define void @v_set_rounding(i32 %rounding) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -4, v0
 ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c84a50f
 ; GFX9-NEXT:    s_mov_b32 s5, 0xb73e62d9
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
@@ -1615,12 +1615,11 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX6-LABEL: get_rounding_after_set_rounding_1:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX6-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
 ; GFX6-NEXT:    s_lshl_b32 s2, s0, 2
 ; GFX6-NEXT:    s_mov_b32 s0, 0xeb24da71
-; GFX6-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX6-NEXT:    s_and_b32 s0, s0, 15
 ; GFX6-NEXT:    s_add_i32 s1, s0, 4
@@ -1636,12 +1635,11 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX7-LABEL: get_rounding_after_set_rounding_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX7-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_nop 0
 ; GFX7-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
 ; GFX7-NEXT:    s_lshl_b32 s2, s0, 2
 ; GFX7-NEXT:    s_mov_b32 s0, 0xeb24da71
-; GFX7-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX7-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX7-NEXT:    s_and_b32 s0, s0, 15
 ; GFX7-NEXT:    s_add_i32 s1, s0, 4
@@ -1657,17 +1655,17 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX8-LABEL: get_rounding_after_set_rounding_1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX8-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
 ; GFX8-NEXT:    s_lshl_b32 s2, s0, 2
 ; GFX8-NEXT:    s_mov_b32 s0, 0xeb24da71
-; GFX8-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    s_add_i32 s1, s0, 4
 ; GFX8-NEXT:    s_cmp_lt_u32 s0, 4
 ; GFX8-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -1676,17 +1674,17 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX9-LABEL: get_rounding_after_set_rounding_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
+; GFX9-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
 ; GFX9-NEXT:    s_lshl_b32 s2, s0, 2
 ; GFX9-NEXT:    s_mov_b32 s0, 0xeb24da71
-; GFX9-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX9-NEXT:    s_and_b32 s0, s0, 15
 ; GFX9-NEXT:    s_add_i32 s1, s0, 4
 ; GFX9-NEXT:    s_cmp_lt_u32 s0, 4
 ; GFX9-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1718,13 +1716,13 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX11-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 4)
 ; GFX11-NEXT:    s_mov_b32 s1, 0xc96f385
 ; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
 ; GFX11-NEXT:    s_and_b32 s0, s0, 15
 ; GFX11-NEXT:    s_add_i32 s1, s0, 4
 ; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 09bdbb28ba2a1..b7ddb8321c68e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -387,8 +387,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -406,8 +405,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-LABEL: local_atomic_fmax_ret_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -484,8 +482,7 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -503,8 +500,7 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-LABEL: local_atomic_fmax_ret_f64__offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2] offset:65528
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -583,8 +579,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_max_num_f64 v0, v[1:2]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -602,8 +597,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-LABEL: local_atomic_fmax_noret_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_max_f64 v0, v[1:2]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -680,8 +674,7 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_max_num_f64 v0, v[1:2] offset:65528
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -699,8 +692,7 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-LABEL: local_atomic_fmax_noret_f64__offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_max_f64 v0, v[1:2] offset:65528
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 3bb98a2a690ed..81dcd95a64bf6 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -387,8 +387,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -406,8 +405,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-LABEL: local_atomic_fmin_ret_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[1:2]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -484,8 +482,7 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -503,8 +500,7 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-LABEL: local_atomic_fmin_ret_f64__offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_min_rtn_f64 v[0:1], v0, v[1:2] offset:65528
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -583,8 +579,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_min_num_f64 v0, v[1:2]
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -602,8 +597,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX11-LABEL: local_atomic_fmin_noret_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_min_f64 v0, v[1:2]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -680,8 +674,7 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    ds_min_num_f64 v0, v[1:2] offset:65528
 ; GFX12-NEXT:    s_wait_dscnt 0x0
@@ -699,8 +692,7 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
 ; GFX11-LABEL: local_atomic_fmin_noret_f64__offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x40100000
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_min_f64 v0, v[1:2] offset:65528
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lrint.ll b/llvm/test/CodeGen/AMDGPU/lrint.ll
index 2f8ea71c1d4be..737ca214d2258 100644
--- a/llvm/test/CodeGen/AMDGPU/lrint.ll
+++ b/llvm/test/CodeGen/AMDGPU/lrint.ll
@@ -209,9 +209,9 @@ define i64 @intrinsic_lrint_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
@@ -412,9 +412,9 @@ define i64 @intrinsic_llrint_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index d8d8308f6cd8a..b75411571f0f4 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -350,6 +350,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
@@ -360,7 +361,6 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
@@ -376,13 +376,13 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
 ; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
 ; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
@@ -641,6 +641,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
@@ -651,7 +652,6 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
 ; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
 ; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
@@ -667,13 +667,13 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
 ; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
 ; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
 ; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index ddfd7875da41d..09703b4922709 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -2216,9 +2216,9 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
 ;
 ; GFX12-LABEL: lshr_mad_i64_sgpr:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
 ; GFX12-NEXT:    s_mov_b32 s3, 0
 ; GFX12-NEXT:    s_mov_b32 s2, s1
+; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
 ; GFX12-NEXT:    s_mov_b32 s5, -1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
index 8073420033aa4..a881b83986f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
@@ -38,8 +38,8 @@ define void @memset_pattern_i128_len1(ptr addrspace(1) align 16 %a) {
 ; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
@@ -111,8 +111,8 @@ define void @memset_pattern_i128_constlen_mainloop_and_residual_taken(ptr addrsp
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
 ; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -147,12 +147,12 @@ define void @memset_pattern_i128_constlen_mainloop_and_residual_taken(ptr addrsp
 ; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB3_1
 ; GFX942-GISEL-NEXT:  ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 0x100, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
-; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
 ; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -189,8 +189,8 @@ define void @memset_pattern_i128_len1_lds(ptr addrspace(3) align 16 %a) {
 ; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
@@ -217,8 +217,8 @@ define void @memset_pattern_i128_len1_no_align(ptr addrspace(1) %a) {
 ; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
@@ -273,8 +273,8 @@ define void @memset_pattern_i128_len16(ptr addrspace(1) align 16 %a) {
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
-; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
 ; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
 ; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -843,8 +843,8 @@ define void @memset_pattern_i64_as7_dynlen(ptr addrspace(7) inreg align 16 %a, i
 ; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
 ; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], 1, v[8:9]
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
-; GFX942-GISEL-NEXT:    v_add3_u32 v4, v0, v1, s16
 ; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942-GISEL-NEXT:    v_add3_u32 v4, v0, v1, s16
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0xccccdddd
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0xaaaabbbb
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
index 26a6cc41ad8fa..35f64a8cad934 100644
--- a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -4,8 +4,8 @@
 # CHECK: ; %bb.0:
 # CHECK:         s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 # CHECK:         s_load_dwordx2 s[0:1], s[4:5], 0x0
-# CHECK:         v_mov_b32_e32 v5, 0
 # CHECK:         v_mov_b32_e32 v4, 0
+# CHECK:         v_mov_b32_e32 v5, 0
 # CHECK:         v_mov_b32_e32 v6, 0
 # CHECK:         s_waitcnt lgkmcnt(0)
 # CHECK:         v_mov_b64_e32 v[2:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
index b7e6ed26876c4..4647f747c3b58 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll
@@ -67,7 +67,7 @@ define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x44
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; CHECK-NEXT:    s_load_b32 s5, s[4:5], 0x54
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_wait_kmcnt 0x0
@@ -76,7 +76,6 @@ define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr addrspace(7) %in2
 ; CHECK-NEXT:    s_mov_b32 s6, s3
 ; CHECK-NEXT:    s_mov_b32 s7, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], s[4:5]
 ; CHECK-NEXT:    s_mov_b32 s5, s2
 ; CHECK-NEXT:    s_mov_b32 s2, s1
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 60743d2cc18c6..ae38099166e46 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -4841,8 +4841,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x7ff
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x7ff
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -4854,8 +4854,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x7ff
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x7ff
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -4946,8 +4946,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x800
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x800
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -4959,8 +4959,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x800
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x800
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5051,8 +5051,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5064,8 +5064,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5156,8 +5156,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5169,8 +5169,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1000
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5261,8 +5261,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1fff
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1fff
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5274,8 +5274,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1fff
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1fff
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5366,8 +5366,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x2000
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x2000
 ; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -5379,8 +5379,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-FAKE16:       ; %bb.0:
 ; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x2000
 ; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x2000
 ; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 255cdc455dd3d..9cf59be504cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -2919,11 +2919,11 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; PACKED-SDAG-NEXT:    s_mov_b32 s2, 4.0
 ; PACKED-SDAG-NEXT:    v_mov_b32_e32 v2, 1.0
+; PACKED-SDAG-NEXT:    v_mov_b32_e32 v3, 2.0
 ; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; PACKED-SDAG-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; PACKED-SDAG-NEXT:    v_mov_b32_e32 v3, 2.0
+; PACKED-SDAG-NEXT:    s_mov_b32 s2, 4.0
 ; PACKED-SDAG-NEXT:    s_mov_b32 s3, 0x40400000
 ; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; PACKED-SDAG-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
@@ -2936,10 +2936,10 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX90A-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX90A-GISEL-NEXT:    s_mov_b32 s4, 1.0
-; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s5, 2.0
 ; GFX90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX90A-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX90A-GISEL-NEXT:    s_mov_b32 s2, 4.0
 ; GFX90A-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; GFX90A-GISEL-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -2953,10 +2953,10 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1.0
-; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, 2.0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX942-GISEL-NEXT:    s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 4.0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 6084381da84f9..a482292b30c72 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -379,8 +379,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x2800
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v11, 0x7f
 ; GFX8-NEXT:    s_movk_i32 s1, 0x800
@@ -505,8 +505,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, s34, v0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX900-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2800, v0
-; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v7, 0x7f
 ; GFX900-NEXT:    s_movk_i32 s2, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index ec8ba1dc65459..1d249feeb65c0 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -3789,8 +3789,8 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX900-LABEL: v_mul_284_add_82_i64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v3, 0x52
 ; GFX900-NEXT:    s_movk_i32 s6, 0x11c
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x52
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
@@ -3800,8 +3800,8 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX90A-LABEL: v_mul_284_add_82_i64:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x52
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x11c
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x52
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5]
@@ -3872,8 +3872,8 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX900-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7da667
 ; GFX900-NEXT:    s_mov_b32 s6, 0x37b4a145
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7da667
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
@@ -3883,8 +3883,8 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX90A-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x7da667
 ; GFX90A-NEXT:    s_mov_b32 s6, 0x37b4a145
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x7da667
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 35c318e9be4f7..002af2e30ddd2 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -111,10 +111,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v13, v[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v13
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, v9, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -140,11 +140,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v28, vcc, -1, v23
 ; GFX9-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v22, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v30, vcc, -1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v12, 0
-; GFX9-NEXT:    v_mov_b32_e32 v18, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v31, vcc, -1, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:  .LBB0_3: ; %udiv-do-while
@@ -1544,10 +1544,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v15, v[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v15
-; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, v11, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, v10, s[4:5]
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1573,11 +1573,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_co_u32_e32 v26, vcc, -1, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v28, vcc, -1, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v14, 0
-; GFX9-NEXT:    v_mov_b32_e32 v20, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v7, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v21, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:  .LBB1_3: ; %udiv-do-while
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 8920bfbd3b9dc..44c860bd64033 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -1154,8 +1154,8 @@ define double @v_roundeven_f64(double %x) {
 ; SDAG_GFX6-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SDAG_GFX6-NEXT:    v_bfi_b32 v3, s6, v2, v1
 ; SDAG_GFX6-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG_GFX6-NEXT:    s_mov_b32 s4, -1
 ; SDAG_GFX6-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; SDAG_GFX6-NEXT:    s_mov_b32 s4, -1
 ; SDAG_GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
 ; SDAG_GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
 ; SDAG_GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 05bdac942ba64..db7ebebe540a4 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -87,11 +87,11 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-GISEL-IR-NEXT:    v_cmp_class_f64_e32 vcc, s[0:1], v2
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v2, v3, v0, vcc
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[2:3], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[2:3], v[0:1]
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
@@ -351,11 +351,11 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-GISEL-IR-NEXT:    s_and_b32 s0, s1, 0x7fffffff
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, s0
+; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v2, v3, v0, vcc
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[2:3], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[2:3], v[0:1]
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
@@ -612,11 +612,11 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-GISEL-IR-NEXT:    v_cmp_class_f64_e32 vcc, s[0:1], v2
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v2, v3, v0, vcc
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[2:3], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
@@ -876,11 +876,11 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-GISEL-IR-NEXT:    s_xor_b32 s0, s1, 0x80000000
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, s0
+; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v2, v3, v0, vcc
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[2:3], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
@@ -3278,12 +3278,12 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-IR-NEXT:    v_rsq_f64_e64 v[2:3], -|v[0:1]|
 ; SI-GISEL-IR-NEXT:    v_cmp_eq_f64_e64 vcc, -|v[0:1]|, 0
 ; SI-GISEL-IR-NEXT:    v_or_b32_e32 v4, 0x80000000, v1
+; SI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; SI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; SI-GISEL-IR-NEXT:    v_mul_f64 v[0:1], v[0:1], -v[2:3]
 ; SI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-IR-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], 1.0
-; SI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; SI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[0:1], v[2:3]
 ; SI-GISEL-IR-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], 0.5
 ; SI-GISEL-IR-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[2:3]
@@ -5389,11 +5389,11 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) {
 ; VI-GISEL-IR-NEXT:    v_cmp_class_f64_e32 vcc, s[0:1], v2
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, s1
+; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v2, v3, v0, vcc
 ; VI-GISEL-IR-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[2:3], v[2:3], -v[0:1]
 ; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-IR-NEXT:    v_mov_b32_e32 v5, 0x3fd80000
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[0:1], 1.0
 ; VI-GISEL-IR-NEXT:    v_mul_f64 v[6:7], v[2:3], v[0:1]
 ; VI-GISEL-IR-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], 0.5
@@ -6548,9 +6548,9 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40700000
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40700000
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index fdb20f372ab8d..8215b24b27b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -430,8 +430,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_not_b32_e32 v4, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, v4, v9
 ; GCN-IR-NEXT:    v_addc_u32_e64 v17, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
@@ -1488,8 +1488,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v14, vcc, 58, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
@@ -1681,8 +1681,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v14, vcc, 47, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[8:9], v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
@@ -1777,8 +1777,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 0xffffffcf, v6
 ; GCN-IR-NEXT:    v_lshr_b64 v[4:5], v[4:5], v7
 ; GCN-IR-NEXT:    v_addc_u32_e64 v11, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 3a2d056dc504a..942ad6f5f6390 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -185,8 +185,8 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -221,8 +221,8 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -257,8 +257,8 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 64, s4
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
@@ -445,8 +445,8 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
@@ -520,8 +520,8 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
@@ -595,8 +595,8 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
index b0575440a76ad..39f594d375b50 100644
--- a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -5,8 +5,8 @@ define amdgpu_kernel void @foo(ptr %0) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 02d2e6c1473ab..8880bc9bb2057 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -411,8 +411,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, v6, v11
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
 ; GCN-IR-NEXT:    v_addc_u32_e64 v17, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
@@ -1637,8 +1637,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, 58, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
@@ -1828,8 +1828,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, 47, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[8:9], v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
@@ -1930,8 +1930,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 0xffffffcf, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v6
 ; GCN-IR-NEXT:    v_addc_u32_e64 v13, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 1130c465c15e3..f16f1202c3991 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s0, s[0:1], 0x0
+; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    s_mov_b32 s2, 0
-; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
+; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 03b653782e5ca..2221647f580cb 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -45,9 +45,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s54, 0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
@@ -77,11 +77,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
 ; GLOBALNESS1-NEXT:    v_writelane_b32 v57, s9, 1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[70:71], 1, v3
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS1-NEXT:    s_mov_b32 s82, s16
 ; GLOBALNESS1-NEXT:    s_mov_b32 s83, s15
 ; GLOBALNESS1-NEXT:    s_mov_b32 s84, s14
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
@@ -179,10 +179,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[44:45]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
@@ -360,9 +360,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
@@ -392,11 +392,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[68:69], 1, v0
 ; GLOBALNESS0-NEXT:    v_writelane_b32 v57, s9, 1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[84:85], 1, v3
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s16
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s15
 ; GLOBALNESS0-NEXT:    s_mov_b32 s82, s14
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v46, 0x80
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v47, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
@@ -494,10 +494,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[44:45]
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[86:87], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[52:53], s[86:87]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 1f93bf7a68972..dd2acb8de6f41 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2450,10 +2450,10 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
 ; GFX1030:       ; %bb.0: ; %bb
 ; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    global_load_sbyte v2, v0, s[0:1]
 ; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
 ; GFX1030-NEXT:    s_waitcnt vmcnt(1)
 ; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 1c50f930facba..41660234562f6 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -350,8 +350,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, v6, v9
 ; GCN-IR-NEXT:    v_addc_u32_e64 v13, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
@@ -1110,8 +1110,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, 47, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[8:9], v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
@@ -1195,8 +1195,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffcf, v6
 ; GCN-IR-NEXT:    v_lshr_b64 v[0:1], v[0:1], v7
 ; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
@@ -1394,8 +1394,8 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffc4, v6
 ; GCN-IR-NEXT:    v_lshr_b64 v[0:1], v[0:1], v7
 ; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 28e6627b87413..da318571dbdfa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -386,8 +386,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, v6, v11
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
 ; GCN-IR-NEXT:    v_addc_u32_e64 v15, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
@@ -1246,8 +1246,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, 47, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[8:9], v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
@@ -1337,8 +1337,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 0xffffffcf, v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v6
 ; GCN-IR-NEXT:    v_addc_u32_e64 v11, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
index c5e2cbf540585..b0e7351959e80 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
@@ -2,8 +2,8 @@
 
 ---
 # GCN-LABEL: name: expand_imm64_sext_shrink_to_bfrev
-# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
-# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec, implicit-def $vgpr0_vgpr1
+# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0
+# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec, implicit-def $vgpr1
 name:            expand_imm64_sext_shrink_to_bfrev
 tracksRegLiveness: true
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 4c68c4519302a..96d3f3a0da66d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -4,8 +4,8 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass postrapseudos %s -o - | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 # GCN-LABEL: name: v_mov_b64_from_vgpr
-# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0
+# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr1
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec
@@ -16,8 +16,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_sgpr
-# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0
+# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr1
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr2_sgpr3, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr2_sgpr3, implicit $exec
@@ -28,10 +28,10 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_sext_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0
+# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
+# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0
+# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 -2, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -2, implicit $exec
 name: v_mov_b64_from_sext_inline_imm
@@ -41,8 +41,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_lit
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0_vgpr1
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec, implicit-def $vgpr0_vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec, implicit-def $vgpr1
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -1000000100000002, implicit $exec
 name: v_mov_b64_from_lit
 body: |
@@ -51,8 +51,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_first_inline_imm
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr1
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 1152921504606846975, implicit $exec
 name: v_mov_b64_from_first_inline_imm
 body: |
@@ -61,8 +61,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_second_inline_imm
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -4026531841, implicit $exec
 name: v_mov_b64_from_second_inline_imm
 body: |
@@ -71,8 +71,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_same_sext_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0
+# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 -1, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -1, implicit $exec
@@ -83,8 +83,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_same_fp_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
-# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0
+# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr1
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 4575657222473777152, implicit $exec
@@ -95,8 +95,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_misalign
-# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
-# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr6
 name: v_mov_b64_misalign
 body: |
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 9dbeba2a095d4..fbacb53f2858e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -298,7 +298,6 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_store_b32 off, v93, s33 offset:404 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s1
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0x60
 ; CHECK-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; CHECK-NEXT:    scratch_store_b32 off, v40, s33 offset:144
 ; CHECK-NEXT:    scratch_store_b32 off, v41, s33 offset:140
@@ -339,30 +338,28 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    scratch_store_b32 off, v110, s33 offset:4
 ; CHECK-NEXT:    scratch_store_b32 off, v111, s33
 ; CHECK-NEXT:    v_dual_mov_b32 v92, v31 :: v_dual_mov_b32 v1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x50
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v3, 0
+; CHECK-NEXT:    v_dual_mov_b32 v2, 0x50 :: v_dual_mov_b32 v5, 0
+; CHECK-NEXT:    v_dual_mov_b32 v4, 64 :: v_dual_mov_b32 v7, 0
+; CHECK-NEXT:    v_mov_b32_e32 v6, 48
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_b128 v[56:59], v[0:1], off
 ; CHECK-NEXT:    global_load_b128 v[104:107], v[2:3], off
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_dual_mov_b32 v6, 48 :: v_dual_mov_b32 v1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x70
-; CHECK-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v8, 0x80
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x70
+; CHECK-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0x80
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_b128 v[108:111], v[4:5], off
+; CHECK-NEXT:    global_load_b128 v[60:63], v[6:7], off
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_clause 0x4
 ; CHECK-NEXT:    global_load_b128 v[72:75], v[0:1], off
 ; CHECK-NEXT:    global_load_b128 v[10:13], v[2:3], off
 ; CHECK-NEXT:    global_load_b128 v[14:17], v[8:9], off
 ; CHECK-NEXT:    global_load_b128 v[18:21], v[8:9], off offset:16
 ; CHECK-NEXT:    global_load_b128 v[22:25], v[8:9], off offset:32
-; CHECK-NEXT:    v_dual_mov_b32 v4, 64 :: v_dual_mov_b32 v7, 0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    global_load_b128 v[108:111], v[4:5], off
-; CHECK-NEXT:    global_load_b128 v[60:63], v[6:7], off
-; CHECK-NEXT:    v_mov_b32_e32 v4, 32
-; CHECK-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 16
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:    v_dual_mov_b32 v4, 32 :: v_dual_mov_b32 v7, 0
+; CHECK-NEXT:    v_mov_b32_e32 v6, 16
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_b128 v[76:79], v[4:5], off
 ; CHECK-NEXT:    global_load_b128 v[88:91], v[6:7], off
@@ -374,13 +371,13 @@ define <8 x half> @baz() nounwind {
 ; CHECK-NEXT:    s_add_co_u32 s0, s0, foo at gotpcrel32@lo+12
 ; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; CHECK-NEXT:    s_add_co_ci_u32 s1, s1, foo at gotpcrel32@hi+24
-; CHECK-NEXT:    s_wait_loadcnt 0x7
+; CHECK-NEXT:    s_wait_loadcnt 0x5
 ; CHECK-NEXT:    scratch_store_b128 off, v[10:13], s33 offset:148 ; 16-byte Folded Spill
-; CHECK-NEXT:    s_wait_loadcnt 0x6
+; CHECK-NEXT:    s_wait_loadcnt 0x4
 ; CHECK-NEXT:    scratch_store_b128 off, v[14:17], s33 offset:164 ; 16-byte Folded Spill
-; CHECK-NEXT:    s_wait_loadcnt 0x5
+; CHECK-NEXT:    s_wait_loadcnt 0x3
 ; CHECK-NEXT:    scratch_store_b128 off, v[18:21], s33 offset:180 ; 16-byte Folded Spill
-; CHECK-NEXT:    s_wait_loadcnt 0x4
+; CHECK-NEXT:    s_wait_loadcnt 0x2
 ; CHECK-NEXT:    s_clause 0x4 ; 80-byte Folded Spill
 ; CHECK-NEXT:    scratch_store_b128 off, v[22:25], s33 offset:196
 ; CHECK-NEXT:    scratch_store_b128 off, v[26:29], s33 offset:212
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 06f28f05c20be..34f4abeee405a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -871,7 +871,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_add_i32 s3, s1, 1
 ; GFX1032-NEXT:    s_cmp_ge_u32 s2, s0
 ; GFX1032-NEXT:    s_cselect_b32 s4, s3, s1
-; GFX1032-NEXT:  .LBB15_3:
+; GFX1032-NEXT:  .LBB15_3: ; %bb.split
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_add_i32 s3, s1, 1
 ; GFX1064-NEXT:    s_cmp_ge_u32 s2, s0
 ; GFX1064-NEXT:    s_cselect_b32 s4, s3, s1
-; GFX1064-NEXT:  .LBB15_3:
+; GFX1064-NEXT:  .LBB15_3: ; %bb.split
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s5
@@ -1692,8 +1692,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, s4
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, s2, s4
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1032-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX1032-NEXT:    s_endpgm
@@ -1706,8 +1706,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, s[4:5]
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, s2, s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1064-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX1064-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index be2c891dd69fd..88f70cca245f8 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -37,14 +37,14 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-LABEL: widen_i16_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
@@ -89,7 +89,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
 ; GFX11-LABEL: widen_i16_constant_load_zext_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -98,7 +98,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
@@ -144,7 +144,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
 ; GFX11-LABEL: widen_i16_constant_load_sext_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -153,7 +153,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
@@ -207,18 +207,18 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-LABEL: widen_i17_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_i32 s0, s0, 34
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x1ffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_d16_hi_b8 v[2:3], v5, off
@@ -261,8 +261,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-TRUE16-LABEL: widen_f16_constant_load:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -273,8 +272,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-FAKE16-LABEL: widen_f16_constant_load:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -331,7 +329,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-LABEL: widen_v2i8_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xc0c0104
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0xc0c0104 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -343,8 +341,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    v_perm_b32 v0, s1, s0, v0
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x2c00, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x300, v2
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -382,10 +379,10 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v2, 0x3e7, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_or_b32_e32 v2, 4, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -408,7 +405,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; GFX11-FAKE16-LABEL: no_widen_i16_constant_divergent_load:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -416,8 +413,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x3e7, v0
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 4, v2
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-FAKE16-NEXT:    s_endpgm
@@ -462,13 +458,13 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-LABEL: widen_i1_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %arg, align 4
@@ -512,7 +508,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
 ; GFX11-LABEL: widen_i16_zextload_i64_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -521,7 +517,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
@@ -567,14 +563,15 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
 ; GFX11-LABEL: widen_i1_zext_to_i64_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %arg, align 4
@@ -620,14 +617,14 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(6) %arg, align 4
@@ -670,14 +667,14 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
 ; GFX11-LABEL: widen_i16_global_invariant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index cfebf404fe925..a5f53e662b421 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -840,8 +840,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[34:35]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 8b3bf0290ec30..65ebf07b21267 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -829,8 +829,8 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
@@ -1688,8 +1688,8 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7

>From d8105dc144ae0acdd8fb28ac35d5c8241521c2aa Mon Sep 17 00:00:00 2001
From: Joseph Nash <joseph.nash at amd.com>
Date: Fri, 3 Apr 2026 14:03:08 -0400
Subject: [PATCH 2/3] Remove implicit def completely

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 24 ++----
 .../AMDGPU/av_movimm_pseudo_expansion.mir     | 76 +++++++++----------
 .../CodeGen/AMDGPU/inflate-av-remat-imm.mir   | 20 ++---
 .../AMDGPU/v_mov_b64_expand_and_shrink.mir    |  4 +-
 .../CodeGen/AMDGPU/v_mov_b64_expansion.mir    | 40 +++++-----
 5 files changed, 78 insertions(+), 86 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ab044f2542e9a..aaabcb71fce5e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2162,11 +2162,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
       Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
-          .addImm(SignExtend64<32>(Imm))
-          .addReg(DstLo, RegState::Implicit | RegState::Define);
+          .addImm(SignExtend64<32>(Imm));
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
-          .addImm(SignExtend64<32>(Imm >> 32))
-          .addReg(DstHi, RegState::Implicit | RegState::Define);
+          .addImm(SignExtend64<32>(Imm >> 32));
       MI.eraseFromParent();
       break;
     }
@@ -2211,11 +2209,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
             .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-          .addImm(Lo.getSExtValue())
-          .addReg(DstLo, RegState::Implicit | RegState::Define);
+          .addImm(Lo.getSExtValue());
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-          .addImm(Hi.getSExtValue())
-          .addReg(DstHi, RegState::Implicit | RegState::Define);
+          .addImm(Hi.getSExtValue());
       }
     } else {
       assert(SrcOp.isReg());
@@ -2233,11 +2229,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
           .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
-          .addReg(DstLo, RegState::Implicit | RegState::Define);
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
-          .addReg(DstHi, RegState::Implicit | RegState::Define);
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
       }
     }
     MI.eraseFromParent();
@@ -2269,11 +2263,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     APInt Lo(32, Imm.getLoBits(32).getZExtValue());
     APInt Hi(32, Imm.getHiBits(32).getZExtValue());
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
-      .addImm(Lo.getSExtValue())
-      .addReg(DstLo, RegState::Implicit | RegState::Define);
+      .addImm(Lo.getSExtValue());
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
-      .addImm(Hi.getSExtValue())
-      .addReg(DstHi, RegState::Implicit | RegState::Define);
+      .addImm(Hi.getSExtValue());
     MI.eraseFromParent();
     break;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index 5286ee30cdbf5..d170aa5eec68f 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -62,8 +62,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_0
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
 
@@ -73,8 +73,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg1
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO -1, implicit $exec
 ...
 
@@ -84,8 +84,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
 ...
 
@@ -95,8 +95,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
     ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -112,12 +112,12 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
-    ; GFX90A: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0
-    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
+    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ;
     ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
     ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 64, implicit $exec
@@ -130,8 +130,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_0_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906944, implicit $exec
 ...
 
@@ -141,8 +141,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_2_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906946, implicit $exec
 ...
 
@@ -152,8 +152,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg16_hi_9_lo
-    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr1
+    ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec
     $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -163,24 +163,24 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
-    ; GFX908: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0
-    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
-    ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
-    ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
-    ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4
-    ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr5
+    ; GFX908: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec
+    ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec
+    ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 1042479491, implicit $exec
+    ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 1042479491, implicit $exec
     ;
     ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
-    ; GFX90A: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0
-    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
-    ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
-    ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
+    ; GFX90A: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec
+    ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec
     ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
     ;
     ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
     ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 1042479491, implicit $exec
-    ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2
-    ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr3
+    ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec
     ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0_vgpr1 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
     $vgpr2_vgpr3 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
@@ -193,8 +193,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_unaligned_agpr
-    ; CHECK: $agpr1 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr1
-    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr2
+    ; CHECK: $agpr1 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec
+    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec
     $agpr1_agpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -204,8 +204,8 @@ tracksRegLiveness: true
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_unaligned_vgpr
-    ; CHECK: $vgpr1 = V_MOV_B32_e32 9, implicit $exec, implicit-def $vgpr1
-    ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr2
+    ; CHECK: $vgpr1 = V_MOV_B32_e32 9, implicit $exec
+    ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec
     $vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
 ...
 
@@ -214,8 +214,8 @@ name: av_mov_b64_misalign_vgpr
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
-    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5
-    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr6
+    ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec
     $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
 
@@ -224,7 +224,7 @@ name: av_mov_b64_misalign_agpr
 body: |
   bb.0:
     ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
-    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5
-    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr6
+    ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
     $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
index 029529671781f..000bea6df1438 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
@@ -120,18 +120,18 @@ body:             |
     ; CHECK-LABEL: name: av_mov_b64_split
     ; CHECK: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $vgpr0, $sgpr4_sgpr5
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0
-    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr1
-    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec, implicit-def $agpr2
-    ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr3
-    ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr4
-    ; CHECK-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5
-    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 3, implicit $exec, implicit-def $vgpr0
-    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
+    ; CHECK-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 3, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
     ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
-    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec, implicit-def $vgpr0
-    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
     ; CHECK-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
     ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
index b0e7351959e80..185310c5e2f7b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir
@@ -2,8 +2,8 @@
 
 ---
 # GCN-LABEL: name: expand_imm64_sext_shrink_to_bfrev
-# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0
-# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec, implicit-def $vgpr1
+# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec
 name:            expand_imm64_sext_shrink_to_bfrev
 tracksRegLiveness: true
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 96d3f3a0da66d..970015cd49d7d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -4,8 +4,8 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass postrapseudos %s -o - | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 # GCN-LABEL: name: v_mov_b64_from_vgpr
-# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0
-# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec
+# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec
@@ -16,8 +16,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_sgpr
-# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0
-# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec
+# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr2_sgpr3, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr2_sgpr3, implicit $exec
@@ -28,10 +28,10 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_sext_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0
-# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
-# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0
-# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec
+# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec
+# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec
+# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 -2, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -2, implicit $exec
 name: v_mov_b64_from_sext_inline_imm
@@ -41,8 +41,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_lit
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec, implicit-def $vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -1000000100000002, implicit $exec
 name: v_mov_b64_from_lit
 body: |
@@ -51,8 +51,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_first_inline_imm
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 1152921504606846975, implicit $exec
 name: v_mov_b64_from_first_inline_imm
 body: |
@@ -61,8 +61,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_second_inline_imm
-# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0
-# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
+# NOT-GFX1250: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec
+# NOT-GFX1250: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -4026531841, implicit $exec
 name: v_mov_b64_from_second_inline_imm
 body: |
@@ -71,8 +71,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_same_sext_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0
-# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec
+# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 -1, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 -1, implicit $exec
@@ -83,8 +83,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_from_same_fp_inline_imm
-# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0
-# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr1
+# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec
 # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
 # GFX942: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec
 # GFX1250: $vgpr0_vgpr1 = V_MOV_B64_e32 4575657222473777152, implicit $exec
@@ -95,8 +95,8 @@ body: |
 ...
 
 # GCN-LABEL: name: v_mov_b64_misalign
-# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5
-# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr6
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec
 name: v_mov_b64_misalign
 body: |
   bb.0:

>From e86da000b31025b5b319de3fd0b5b0fb1e6fd725 Mon Sep 17 00:00:00 2001
From: Joseph Nash <joseph.nash at amd.com>
Date: Fri, 3 Apr 2026 14:14:51 -0400
Subject: [PATCH 3/3] clang format

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aaabcb71fce5e..3179c65340d18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2209,9 +2209,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
             .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-          .addImm(Lo.getSExtValue());
+            .addImm(Lo.getSExtValue());
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-          .addImm(Hi.getSExtValue());
+            .addImm(Hi.getSExtValue());
       }
     } else {
       assert(SrcOp.isReg());
@@ -2229,9 +2229,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
           .addImm(0); // clamp
       } else {
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
+            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
+            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
       }
     }
     MI.eraseFromParent();
@@ -2263,9 +2263,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     APInt Lo(32, Imm.getLoBits(32).getZExtValue());
     APInt Hi(32, Imm.getHiBits(32).getZExtValue());
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
-      .addImm(Lo.getSExtValue());
+        .addImm(Lo.getSExtValue());
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
-      .addImm(Hi.getSExtValue());
+        .addImm(Hi.getSExtValue());
     MI.eraseFromParent();
     break;
   }



More information about the llvm-commits mailing list