[llvm] PeepholeOpt: Do not add subregister indexes to reg_sequence operands (PR #124111)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 30 05:39:24 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/124111

>From d1e1edd807382d02698506b9854479d5921655c6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 23 Jan 2025 12:52:48 +0700
Subject: [PATCH] PeepholeOpt: Do not add subregister indexes to reg_sequence
 operands

Given that the rest of the pass just gives up when it needs to compose
subregisters, folding a subregister extract directly into a reg_sequence
is counterproductive. Later fold attempts in the function will give up
on the subregister operand, which prevents looking through the reg_sequence.

It may still be profitable to do these folds once composing subregister
indices is handled. There are some test regressions, but the results
mostly look better.
---
 llvm/lib/CodeGen/PeepholeOptimizer.cpp        |   6 +
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 224 ++++-----
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |   8 +-
 .../atomic_optimizations_global_pointer.ll    | 104 ++--
 .../atomic_optimizations_local_pointer.ll     |  34 +-
 .../CodeGen/AMDGPU/div-rem-by-constant-64.ll  | 312 +++++-------
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        | 137 +++---
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        | 400 ++++++++--------
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  74 +--
 llvm/test/CodeGen/AMDGPU/idot2.ll             | 182 +++----
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |  22 +-
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    | 160 +++----
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         | 125 +++--
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  | 142 +++---
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  16 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |  22 +-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   | 453 ++++++++++--------
 ...uffer-fat-pointers-nontemporal-metadata.ll |   8 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         | 124 +++--
 .../AMDGPU/move-to-valu-atomicrmw-system.ll   |  19 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               | 204 ++++----
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |  60 ++-
 llvm/test/CodeGen/AMDGPU/sdiv.ll              |  40 +-
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   | 316 ++++++------
 llvm/test/CodeGen/AMDGPU/spill-vgpr.ll        |   8 +-
 llvm/test/CodeGen/AMDGPU/sra.ll               |  68 +--
 llvm/test/CodeGen/AMDGPU/udiv.ll              |  12 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |   9 +-
 28 files changed, 1605 insertions(+), 1684 deletions(-)

diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 48c25d5039bfd4b..af4f2dc49b690b6 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -436,6 +436,12 @@ class RegSequenceRewriter : public Rewriter {
     if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
       return false;
 
+    // Do not introduce new subregister uses in a reg_sequence. Until composing
+    // subregister indices is supported while folding, we're just blocking
+    // folding of subregister copies later in the function.
+    if (NewSubReg)
+      return false;
+
     MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
     MO.setReg(NewReg);
     MO.setSubReg(NewSubReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 40f29c56c8f1272..d41601cc0d76e45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1635,6 +1635,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s11
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
@@ -1682,33 +1683,32 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT:    v_add3_u32 v6, v3, v2, v6
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v6, v[1:2]
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s10, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[1:2]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-NEXT:    s_ashr_i32 s10, s3, 31
-; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v0, s11, v1
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s8, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v2, s[0:1]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v7, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, s11, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v1, v12, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v3, v12, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s18, s6
 ; GFX9-NEXT:    s_addc_u32 s1, s19, s6
@@ -1716,116 +1716,116 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s10
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v15
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v16
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v15
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s8, v8
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v16, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v4
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v2
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
 ; GFX9-NEXT:    s_sub_u32 s5, 0, s2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v2, v13, vcc
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v16
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v17, 0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v4
 ; GFX9-NEXT:    s_subb_u32 s20, 0, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v3, v11, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
-; GFX9-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v15, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v17, v1
-; GFX9-NEXT:    v_mul_hi_u32 v10, v17, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v12, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v13, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v17, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, v13, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v12, v[3:4]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v12, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[18:19], s20, v17, v[3:4]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v15, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v17, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[0:1]
+; GFX9-NEXT:    v_mul_hi_u32 v10, v17, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v3
+; GFX9-NEXT:    v_mul_hi_u32 v2, v12, v2
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, v17, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v10, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v17, v0
-; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, v10, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v3, v8, v7, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v17, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v12, v3, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[18:19], s5, v7, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v1, v9, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v8, v[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v10, s17, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT:    v_xor_b32_e32 v8, s17, v8
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s17
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v7, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v11, s17
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v7, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v10, v11, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
-; GFX9-NEXT:    v_mul_hi_u32 v2, v11, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v11, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, v8, v3
+; GFX9-NEXT:    v_mul_hi_u32 v2, v8, v2
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v6, v7, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, v8, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, v10, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v7, v5, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
+; GFX9-NEXT:    v_add3_u32 v3, v6, v5, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s8, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, s8, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v3
+; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s9, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s9, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, s9, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v3
+; GFX9-NEXT:    v_xor_b32_e32 v9, s4, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v2, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
-; GFX9-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT:    v_mov_b32_e32 v7, s4
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, v9, v7
 ; GFX9-NEXT:    v_add3_u32 v8, v6, v11, v12
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v7, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 4f04c15b3d44ab3..8e16889c72e65a2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -10041,11 +10041,9 @@ define i64 @udiv_i64_gt_smax(i8 %size) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xcccccccc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v1
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
 ; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index adc91d56c3c278d..6166c05c6f8959d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2216,31 +2216,31 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_clause 0x1
 ; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1264-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1264-NEXT:    s_mov_b32 s11, 0
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-NEXT:    s_mov_b32 s11, 0
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1264-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
+; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[6:7]
+; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mul_u64 s[8:9], s[4:5], s[10:11]
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT:    s_mul_u64 s[6:7], s[4:5], s[10:11]
+; GFX1264-NEXT:    s_mov_b32 s14, -1
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_mov_b32 s8, s2
-; GFX1264-NEXT:    s_mov_b32 s9, s3
-; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-NEXT:    s_mov_b32 s12, s2
+; GFX1264-NEXT:    s_mov_b32 s13, s3
+; GFX1264-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB4_2:
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
@@ -5600,17 +5600,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:  .LBB10_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v2, 0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v2, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s8, v3
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s7, v2, v[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v0, vcc
+; GFX9-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_uniform:
@@ -5651,10 +5651,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -5695,10 +5694,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s7, v2, v[4:5]
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
@@ -5742,9 +5740,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
 ; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
-; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
@@ -5788,9 +5785,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
 ; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
-; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1132-NEXT:    s_endpgm
@@ -5800,31 +5796,31 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_clause 0x1
 ; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1264-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1264-NEXT:    s_mov_b32 s11, 0
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-NEXT:    s_mov_b32 s11, 0
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1264-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[8:9]
+; GFX1264-NEXT:    s_bcnt1_i32_b64 s10, s[6:7]
+; GFX1264-NEXT:    s_mov_b32 s15, 0x31016000
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mul_u64 s[8:9], s[4:5], s[10:11]
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT:    s_mul_u64 s[6:7], s[4:5], s[10:11]
+; GFX1264-NEXT:    s_mov_b32 s14, -1
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s9
-; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_mov_b32 s8, s2
-; GFX1264-NEXT:    s_mov_b32 s9, s3
-; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-NEXT:    s_mov_b32 s12, s2
+; GFX1264-NEXT:    s_mov_b32 s13, s3
+; GFX1264-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB10_2:
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1264-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[3:4], null, s4, v2, 0
 ; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
@@ -5833,9 +5829,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
 ; GFX1264-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1264-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX1264-NEXT:    s_endpgm
@@ -5877,9 +5872,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
 ; GFX1232-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1232-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
 ; GFX1232-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 45b161d7959f4fc..8062dbbca73932b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -5235,19 +5235,19 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX9-NEXT:  .LBB13_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s1, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s8, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v0, vcc
+; GFX9-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i64_uniform:
@@ -5283,9 +5283,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v4, vcc
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -5321,9 +5320,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v4, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
@@ -5362,9 +5360,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
-; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
@@ -5402,9 +5399,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
-; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1132-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 1c6808b613427e2..0c5b67580c35206 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -15,16 +15,14 @@ define noundef i64 @srem64_3(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x55555555
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, v4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v4, s6
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v6, v4, s6
+; GFX9-NEXT:    v_mul_lo_u32 v5, v4, s7
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
@@ -61,10 +59,9 @@ define noundef i64 @srem64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: srem64_3:
@@ -73,14 +70,12 @@ define noundef i64 @srem64_3(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x55555556, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5]
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX1030-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555555, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555555, v4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555556, v4, v[2:3]
 ; GFX1030-NEXT:    v_mul_lo_u32 v4, 0x55555556, v4
@@ -107,16 +102,14 @@ define noundef i64 @srem64_6(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x55555555
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, v4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mul_lo_u32 v6, v4, s6
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v6, v4, s6
+; GFX9-NEXT:    v_mul_lo_u32 v5, v4, s7
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
@@ -153,10 +146,9 @@ define noundef i64 @srem64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: srem64_6:
@@ -165,14 +157,12 @@ define noundef i64 @srem64_6(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x55555556, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5]
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX1030-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555555, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555555, v4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555556, v4, v[2:3]
 ; GFX1030-NEXT:    v_mul_lo_u32 v4, 0x55555556, v4
@@ -199,11 +189,9 @@ define noundef i64 @urem64_3(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xaaaaaaaa
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v2, 1
@@ -235,10 +223,9 @@ define noundef i64 @urem64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: urem64_3:
@@ -247,11 +234,9 @@ define noundef i64 @urem64_3(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0xaaaaaaab, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3]
 ; GFX1030-NEXT:    v_alignbit_b32 v2, v3, v2, 1
@@ -276,11 +261,9 @@ define noundef i64 @urem64_6(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xaaaaaaaa
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v2, 2
@@ -312,10 +295,9 @@ define noundef i64 @urem64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: urem64_6:
@@ -324,11 +306,9 @@ define noundef i64 @urem64_6(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0xaaaaaaab, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3]
 ; GFX1030-NEXT:    v_alignbit_b32 v2, v3, v2, 2
@@ -353,15 +333,13 @@ define noundef i64 @sdiv64_3(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x55555555
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s6
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v1, v4
@@ -400,16 +378,14 @@ define noundef i64 @sdiv64_3(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x55555556, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v0
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3]
 ; GFX1030-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mul_lo_u32 v4, 0x55555555, v0
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555556, v0
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0x55555556, v0, v[2:3]
 ; GFX1030-NEXT:    v_add3_u32 v1, v5, v1, v4
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
@@ -430,15 +406,13 @@ define noundef i64 @sdiv64_6(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x55555555
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s7, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s7, v[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s6
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v1, v4
@@ -477,16 +451,14 @@ define noundef i64 @sdiv64_6(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x55555556, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x55555556, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v0
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v0, v[2:3]
 ; GFX1030-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mul_lo_u32 v4, 0x55555555, v0
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mul_lo_u32 v5, 0x55555556, v0
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x55555555, v1, v[2:3]
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0x55555556, v0, v[2:3]
 ; GFX1030-NEXT:    v_add3_u32 v1, v5, v1, v4
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
@@ -507,11 +479,9 @@ define noundef i64 @udiv64_3(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xaaaaaaaa
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 1
@@ -543,11 +513,9 @@ define noundef i64 @udiv64_3(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0xaaaaaaab, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0xaaaaaaaa, v1, v[2:3]
 ; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 1
@@ -567,11 +535,9 @@ define noundef i64 @udiv64_6(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0xaaaaaaaa
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 2
@@ -603,11 +569,9 @@ define noundef i64 @udiv64_6(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0xaaaaaaab, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0xaaaaaaaa, v1, v[2:3]
 ; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 2
@@ -1005,8 +969,7 @@ define noundef i64 @udiv64_i32min(i64 noundef %i)  {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, 1, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -1033,8 +996,7 @@ define noundef i64 @udiv64_i32min(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 31
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 1, v[1:2]
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1030-NEXT:    v_add_co_u32 v0, s4, v1, v0
+; GFX1030-NEXT:    v_add_co_u32 v0, s4, v1, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s4
 ; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 1
 ; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
@@ -1049,19 +1011,17 @@ define noundef i64 @srem64_i32max(i64 noundef %i)  {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, 3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, 3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, 3, v[2:3]
+; GFX9-NEXT:    v_lshl_add_u32 v2, v8, 31, v8
+; GFX9-NEXT:    v_add3_u32 v5, v5, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    s_mov_b32 s6, 0x80000001
-; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT:    v_lshl_add_u32 v8, v6, 31, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_add3_u32 v7, v7, v8, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v1
@@ -1112,10 +1072,9 @@ define noundef i64 @srem64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: srem64_i32max:
@@ -1123,23 +1082,21 @@ define noundef i64 @srem64_i32max(i64 noundef %i)  {
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, v0, 3
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1030-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX1030-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v8, 3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v6, 3, 0
-; GFX1030-NEXT:    v_lshl_add_u32 v6, v6, 31, v6
-; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX1030-NEXT:    v_mov_b32_e32 v4, v5
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT:    v_add_co_u32 v4, s4, v7, v4
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, 0, s4
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v3, v1
-; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v6, v0
-; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo
+; GFX1030-NEXT:    v_lshl_add_u32 v8, v8, 31, v8
+; GFX1030-NEXT:    v_add3_u32 v7, v7, v8, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v0, -1, v[6:7]
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x80000001, v0, v[2:3]
+; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v7, v1
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v0
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
+; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3]
 ; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v1, v3
 ; GFX1030-NEXT:    v_ashrrev_i64 v[4:5], 30, v[2:3]
@@ -1161,19 +1118,17 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i)  {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, 3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, 3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, 3, v[2:3]
+; GFX9-NEXT:    v_lshl_add_u32 v2, v8, 31, v8
+; GFX9-NEXT:    v_add3_u32 v5, v5, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    s_mov_b32 s6, 0x80000001
-; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT:    v_lshl_add_u32 v8, v6, 31, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT:    v_add3_u32 v7, v7, v8, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
 ; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v1
@@ -1222,23 +1177,21 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i)  {
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, v0, 3
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1030-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX1030-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v8, 3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v7, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v6, 3, 0
-; GFX1030-NEXT:    v_lshl_add_u32 v6, v6, 31, v6
-; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v6, v2
-; GFX1030-NEXT:    v_mov_b32_e32 v4, v5
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT:    v_add_co_u32 v4, s4, v7, v4
-; GFX1030-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, 0, s4
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v3, v1
-; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v6, v0
-; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo
+; GFX1030-NEXT:    v_lshl_add_u32 v8, v8, 31, v8
+; GFX1030-NEXT:    v_add3_u32 v7, v7, v8, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v0, -1, v[6:7]
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x80000001, v0, v[2:3]
+; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v7, v1
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
+; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
+; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v0
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
+; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3]
 ; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v1, v3
 ; GFX1030-NEXT:    v_ashrrev_i64 v[0:1], 30, v[2:3]
@@ -1259,11 +1212,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_brev_b32 s6, -2
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, 5, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, 2, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
@@ -1303,10 +1254,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
 ; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
-; GFX942-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: urem64_i32max:
@@ -1315,11 +1265,9 @@ define noundef i64 @urem64_i32max(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, v0, 5
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, 5, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 2, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 2, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3]
 ; GFX1030-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
@@ -1346,11 +1294,9 @@ define noundef i64 @udiv64_i32max(i64 noundef %i)  {
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, 5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, 5, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, 2, v[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, 2, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
@@ -1390,11 +1336,9 @@ define noundef i64 @udiv64_i32max(i64 noundef %i)  {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, v0, 5
 ; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, 5, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v6, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v5, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 2, v[4:5]
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v3
-; GFX1030-NEXT:    v_add_co_u32 v2, s4, v6, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, 2, v[2:3]
+; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3]
 ; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 8d65fa053eaa49b..41999b249a0e890 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1954,65 +1954,62 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_mul_lo_u32 v12, v33, v3
 ; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v2
-; SDAG-NEXT:    v_mul_lo_u32 v35, v35, v31
-; SDAG-NEXT:    v_mul_lo_u32 v38, v32, v30
+; SDAG-NEXT:    v_mul_lo_u32 v25, v35, v31
+; SDAG-NEXT:    v_mul_lo_u32 v35, v32, v30
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
-; SDAG-NEXT:    v_mul_lo_u32 v25, v14, v7
+; SDAG-NEXT:    v_mul_lo_u32 v38, v14, v7
 ; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v14, v6, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v39, v15, v6
-; SDAG-NEXT:    v_mul_lo_u32 v19, v19, v37
-; SDAG-NEXT:    v_mul_lo_u32 v48, v18, v36
+; SDAG-NEXT:    v_mul_lo_u32 v48, v19, v37
+; SDAG-NEXT:    v_mul_lo_u32 v49, v18, v36
 ; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0
 ; SDAG-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v12, v3
 ; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[12:13]
-; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v16, v2
-; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v21, v25
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v2
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v21, v38
 ; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v24
-; SDAG-NEXT:    v_mov_b32_e32 v24, v23
-; SDAG-NEXT:    v_mov_b32_e32 v23, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[22:23]
-; SDAG-NEXT:    v_xor_b32_e32 v33, v12, v28
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v16, v39
-; SDAG-NEXT:    v_mov_b32_e32 v12, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v36, v14, v[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[12:13]
+; SDAG-NEXT:    v_xor_b32_e32 v24, v16, v28
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v19, v39
 ; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
-; SDAG-NEXT:    v_add_i32_e64 v24, s[4:5], v24, v3
-; SDAG-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v17, v2, vcc
+; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], v23, v3
+; SDAG-NEXT:    v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v31, vcc, v17, v2, vcc
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
-; SDAG-NEXT:    v_mov_b32_e32 v14, v23
-; SDAG-NEXT:    v_mov_b32_e32 v23, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v37, v15, v[22:23]
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v35, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v30, v27, v[24:25]
-; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v29
-; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v18, v12
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v38, v11
+; SDAG-NEXT:    v_mov_b32_e32 v12, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[12:13]
+; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v25, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v30, v27, v[22:23]
+; SDAG-NEXT:    v_xor_b32_e32 v14, v31, v29
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v48, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v36, v15, v[13:14]
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v16, v10
-; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], v17, v19, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v12, v16
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v37, v15, v[12:13]
+; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v35, v7
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v49, v3
+; SDAG-NEXT:    v_add_i32_e64 v12, s[4:5], v17, v12
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
+; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], v19, v7, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v10, vcc
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v2
-; SDAG-NEXT:    v_addc_u32_e64 v11, s[4:5], v12, v3, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v28
+; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v36, v15, v[12:13]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v7, v0, v28
+; SDAG-NEXT:    v_add_i32_e32 v10, vcc, v12, v2
+; SDAG-NEXT:    v_addc_u32_e32 v12, vcc, v13, v3, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v3, v1, v29
-; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v33, v28
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v7, v29, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v28, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v24, v28
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v14, v29, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v7, v28, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v29, vcc
 ; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v11, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v6, v6, v26
 ; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v10, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v34
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v11, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v12, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, v4, v26
 ; SDAG-NEXT:    v_xor_b32_e32 v9, v5, v34
 ; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v6, v26
@@ -2827,49 +2824,43 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_mul_lo_u32 v34, v18, v15
 ; SDAG-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v35, v19, v14
-; SDAG-NEXT:    v_mul_lo_u32 v23, v23, v12
-; SDAG-NEXT:    v_mul_lo_u32 v36, v22, v13
+; SDAG-NEXT:    v_mul_lo_u32 v36, v23, v12
+; SDAG-NEXT:    v_mul_lo_u32 v37, v22, v13
 ; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v12, v18, 0
 ; SDAG-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v11
 ; SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21]
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], v25, v34
+; SDAG-NEXT:    v_add_i32_e64 v23, s[4:5], v25, v34
 ; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v17, v28
-; SDAG-NEXT:    v_mov_b32_e32 v28, v27
-; SDAG-NEXT:    v_mov_b32_e32 v27, v21
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27]
-; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v20, v35
+; SDAG-NEXT:    v_mov_b32_e32 v20, v26
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21]
+; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v23, v35
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v31, v8, v[16:17]
+; SDAG-NEXT:    v_add_i32_e64 v26, s[4:5], v27, v11
+; SDAG-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v10, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v13, v18, v[20:21]
-; SDAG-NEXT:    v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17]
-; SDAG-NEXT:    v_mov_b32_e32 v8, v11
-; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v28, v8
+; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v13, v18, v[20:21]
+; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v29, v17
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27]
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v36, v11
+; SDAG-NEXT:    v_mov_b32_e32 v20, v22
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v12, v19, v[20:21]
+; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v33, v15
+; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], v37, v17
+; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v23, v12
 ; SDAG-NEXT:    v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v8, v10
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
-; SDAG-NEXT:    v_mov_b32_e32 v22, v27
-; SDAG-NEXT:    v_mov_b32_e32 v27, v21
-; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v12, v19, v[26:27]
-; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v29, v16
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18]
-; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v23, v11
-; SDAG-NEXT:    v_mov_b32_e32 v11, v21
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v22, v11
-; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v33, v16
-; SDAG-NEXT:    v_add_i32_e64 v17, s[4:5], v36, v17
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v13, v19, v[11:12]
-; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
-; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5]
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v16
+; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v15, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v8, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; SDAG-NEXT:    v_add_i32_e32 v8, vcc, v11, v10
-; SDAG-NEXT:    v_addc_u32_e32 v9, vcc, v12, v17, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v19, v[17:18]
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; SDAG-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
 ; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v10, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v11, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v8, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v9, vcc
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 3a7f3e41002d282..3e6b812c12d7f63 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -61,32 +61,31 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
 ; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v1, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; SDAG-NEXT:    v_mul_lo_u32 v6, v10, v6
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3]
-; SDAG-NEXT:    v_mul_lo_u32 v10, v9, v7
-; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v12
-; SDAG-NEXT:    v_mov_b32_e32 v6, v2
-; SDAG-NEXT:    v_mov_b32_e32 v2, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
-; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v6, v2
-; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT:    v_mul_lo_u32 v14, v10, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    v_add3_u32 v5, v5, v14, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mul_lo_u32 v6, v9, v11
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v12
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3]
 ; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    v_add3_u32 v4, v9, v4, v10
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v3
-; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
+; SDAG-NEXT:    v_add3_u32 v5, v9, v5, v6
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v2, v4
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT:    ; implicit-def: $vgpr10
 ; SDAG-NEXT:    ; implicit-def: $vgpr9
 ; SDAG-NEXT:  .LBB0_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
@@ -103,10 +102,9 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2]
-; SDAG-NEXT:    v_mov_b32_e32 v7, v4
-; SDAG-NEXT:    v_mov_b32_e32 v4, v2
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2]
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v4, v2
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
@@ -429,32 +427,31 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
 ; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v1, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; SDAG-NEXT:    v_mul_lo_u32 v6, v10, v6
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3]
-; SDAG-NEXT:    v_mul_lo_u32 v10, v9, v7
-; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v12
-; SDAG-NEXT:    v_mov_b32_e32 v6, v2
-; SDAG-NEXT:    v_mov_b32_e32 v2, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
-; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v6, v2
-; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT:    v_mul_lo_u32 v14, v10, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    v_add3_u32 v5, v5, v14, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mul_lo_u32 v6, v9, v11
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v12
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3]
 ; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    v_add3_u32 v4, v9, v4, v10
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v3
-; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
+; SDAG-NEXT:    v_add3_u32 v5, v9, v5, v6
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v2, v4
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5]
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT:    ; implicit-def: $vgpr10
 ; SDAG-NEXT:    ; implicit-def: $vgpr9
 ; SDAG-NEXT:  .LBB1_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
@@ -471,10 +468,9 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2]
-; SDAG-NEXT:    v_mov_b32_e32 v7, v4
-; SDAG-NEXT:    v_mov_b32_e32 v4, v2
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2]
+; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v4, v2
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
@@ -743,22 +739,22 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; SDAG:       ; %bb.0: ; %fp-to-i-entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    v_bfe_u32 v6, v4, 23, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v6
 ; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_10
 ; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_mov_b32 s7, -1
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -774,66 +770,65 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e64 v9, s[4:5], -1, v0
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x95
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    v_or_b32_e32 v4, 0x800000, v0
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_4
 ; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
 ; SDAG-NEXT:    v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
 ; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
 ; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v6, v5
+; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v7, v5
 ; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v12
 ; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
 ; SDAG-NEXT:    ; implicit-def: $vgpr10
 ; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    ; implicit-def: $vgpr9
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add3_u32 v3, v9, v2, v3
 ; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
 ; SDAG-NEXT:  .LBB2_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_6
 ; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v4, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
@@ -949,8 +944,8 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
-; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v10, v1, v2, 1
 ; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -968,7 +963,7 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
 ; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
 ; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
@@ -977,24 +972,24 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
-; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
 ; GISEL-NEXT:    ; implicit-def: $vgpr8
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7]
 ; GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    ; implicit-def: $vgpr10
 ; GISEL-NEXT:  .LBB2_4: ; %Flow
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_6
@@ -1005,9 +1000,9 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
 ; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:  .LBB2_6: ; %Flow1
@@ -1098,22 +1093,22 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG:       ; %bb.0: ; %fp-to-i-entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    v_bfe_u32 v6, v4, 23, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v6
 ; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB3_10
 ; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_mov_b32 s7, -1
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1129,66 +1124,65 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; SDAG-NEXT:    v_add_co_u32_e64 v9, s[4:5], -1, v0
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x95
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    v_or_b32_e32 v4, 0x800000, v0
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB3_4
 ; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
 ; SDAG-NEXT:    v_mul_lo_u32 v14, v10, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
 ; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
 ; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v6, v5
+; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v7, v5
 ; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v12
 ; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
 ; SDAG-NEXT:    ; implicit-def: $vgpr10
 ; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    ; implicit-def: $vgpr9
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add3_u32 v3, v9, v2, v3
 ; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
 ; SDAG-NEXT:  .LBB3_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
 ; SDAG-NEXT:    s_cbranch_execz .LBB3_6
 ; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v4, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
@@ -1304,8 +1298,8 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
-; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v10, v1, v2, 1
 ; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -1323,7 +1317,7 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
 ; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
 ; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
 ; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
@@ -1332,24 +1326,24 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v10, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v10, v[1:2]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
 ; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
-; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
 ; GISEL-NEXT:    ; implicit-def: $vgpr8
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v10, v[6:7]
 ; GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    ; implicit-def: $vgpr10
 ; GISEL-NEXT:  .LBB3_4: ; %Flow
 ; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
 ; GISEL-NEXT:    s_cbranch_execz .LBB3_6
@@ -1360,9 +1354,9 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v9
 ; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
 ; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:  .LBB3_6: ; %Flow1
@@ -1481,22 +1475,22 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; SDAG:       ; %bb.0: ; %fp-to-i-entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 7, 8
+; SDAG-NEXT:    v_bfe_u32 v6, v4, 7, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v6
 ; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB6_10
 ; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_mov_b32 s7, -1
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1511,66 +1505,65 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7f
 ; SDAG-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x80, v0
+; SDAG-NEXT:    v_or_b32_e32 v4, 0x80, v0
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB6_4
 ; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_add_co_u32_e64 v10, s[4:5], -1, v0
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff7a, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v13, v9, v2
 ; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; SDAG-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
 ; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
-; SDAG-NEXT:    v_mul_lo_u32 v3, v6, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v6, v12
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v7, v5
 ; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mul_lo_u32 v3, v10, v11
+; SDAG-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
-; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v3, v8, v2, v3
 ; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr8
 ; SDAG-NEXT:  .LBB6_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
 ; SDAG-NEXT:    s_cbranch_execz .LBB6_6
 ; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
@@ -1830,22 +1823,22 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG:       ; %bb.0: ; %fp-to-i-entry
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 7, 8
+; SDAG-NEXT:    v_bfe_u32 v6, v4, 7, 8
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7e
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v6
 ; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB7_10
 ; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
 ; SDAG-NEXT:    s_mov_b32 s7, -1
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
@@ -1860,66 +1853,65 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; SDAG-NEXT:    s_movk_i32 s4, 0x7f
 ; SDAG-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x80, v0
+; SDAG-NEXT:    v_or_b32_e32 v4, 0x80, v0
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB7_4
 ; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_add_co_u32_e64 v10, s[4:5], -1, v0
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff7a, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
 ; SDAG-NEXT:    v_mul_lo_u32 v13, v9, v2
 ; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v11, v8, v[4:5]
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mov_b32_e32 v4, v6
 ; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; SDAG-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
 ; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
-; SDAG-NEXT:    v_mul_lo_u32 v3, v6, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v6, v12
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v7, v5
 ; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mul_lo_u32 v3, v10, v11
+; SDAG-NEXT:    v_mul_lo_u32 v8, v10, v12
 ; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
-; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v3, v8, v2, v3
 ; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
 ; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr8
 ; SDAG-NEXT:  .LBB7_4: ; %Flow
 ; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
 ; SDAG-NEXT:    s_cbranch_execz .LBB7_6
 ; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 3d3e8bea7e33ef4..051a0c51b086720 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -155,61 +155,61 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ;
 ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
 ; VI-SAFE-SDAG:       ; %bb.0:
-; VI-SAFE-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
-; VI-SAFE-SDAG-NEXT:    s_mov_b32 s3, 0xf000
-; VI-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
+; VI-SAFE-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-SAFE-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; VI-SAFE-SDAG-NEXT:    s_mov_b32 s6, -1
 ; VI-SAFE-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-SAFE-SDAG-NEXT:    s_mov_b32 s0, s4
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s7, 8
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s4, 0xffe
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s4, s7, 0x1ff
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s6
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-SAFE-SDAG-NEXT:    s_mov_b32 s1, s5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s7, 0xb0014
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s8, s4
-; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s5
+; VI-SAFE-SDAG-NEXT:    s_mov_b32 s4, s0
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s0, s3, 8
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s0, 0xffe
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s0, s3, 0x1ff
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s0, s0, s2
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-SAFE-SDAG-NEXT:    s_mov_b32 s5, s1
+; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s1, s3, 0xb0014
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s2, s8, s0
+; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s1
 ; VI-SAFE-SDAG-NEXT:    v_med3_i32 v0, s8, 0, 13
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, 0x1000
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s0, s2, 0x1000
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s8, s0, s8
 ; VI-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v0, s8
-; VI-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
+; VI-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v0
 ; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SAFE-SDAG-NEXT:    s_add_i32 s10, s5, 0xfffffc10
-; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s5, s10, 12
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s8, s4
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
+; VI-SAFE-SDAG-NEXT:    s_add_i32 s10, s1, 0xfffffc10
+; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s1, s10, 12
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s0, s8, s0
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s1, s2, s1
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 1
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s11, s4, s5
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s11, s0, s1
 ; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s11, 7
 ; VI-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s8, 5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s8, 3
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-SAFE-SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; VI-SAFE-SDAG-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s8, s11, 2
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; VI-SAFE-SDAG-NEXT:    s_addc_u32 s4, s8, 0
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; VI-SAFE-SDAG-NEXT:    s_addc_u32 s0, s8, 0
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 31
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s8, s4, 0x7c00
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s8, s0, 0x7c00
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; VI-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
 ; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s10, 0x40f
 ; VI-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
 ; VI-SAFE-SDAG-NEXT:    v_mov_b32_e32 v1, s8
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s7, 16
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s0, s3, 16
 ; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s4, s4, 0x8000
-; VI-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s4, v0
-; VI-SAFE-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s0, s0, 0x8000
+; VI-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-SAFE-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-SAFE-SDAG-NEXT:    s_endpgm
 ;
 ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b443e654350c5ea..cd85c301e16d5b3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -151,20 +151,20 @@ entry:
 define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MulMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -174,8 +174,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_MulMul:
@@ -1698,20 +1698,20 @@ entry:
 define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_add1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1719,10 +1719,10 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_MultipleUses_add1:
@@ -1851,20 +1851,20 @@ entry:
 define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_add1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s0
+; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
 ; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v1, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot2_MultipleUses_add1:
@@ -2004,20 +2004,20 @@ entry:
 define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -2025,10 +2025,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s0
+; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s4
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_MultipleUses_mul1:
@@ -2163,20 +2163,20 @@ entry:
 define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
@@ -2184,10 +2184,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v4, v3, v1, s0
+; GFX7-NEXT:    v_mad_i32_i24 v4, v3, v1, s4
 ; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
 ; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot2_MultipleUses_mul1:
@@ -2322,31 +2322,31 @@ entry:
 define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_mul2:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s0
+; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_MultipleUses_mul2:
@@ -2479,20 +2479,20 @@ entry:
 define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_mul2:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s10, 0
-; GFX7-NEXT:    s_mov_b32 s11, s7
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, s3
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
@@ -2500,10 +2500,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s0
+; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s4
 ; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
 ; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot2_MultipleUses_mul2:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 10fac09ef4ec071..3a97724d81fbe8d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -4906,19 +4906,19 @@ entry:
 define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32_hilo:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, s3
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, s7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -4933,7 +4933,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot4_acc32_hilo:
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index c0de009e935e64f..c0c0d3ded117d55 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5168,38 +5168,34 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v4, v2
 ; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v7, v4, v2
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v7, v0
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v4, v2
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v4, v2
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1]
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v3, v4, v2
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2]
 ; GFX1200-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v6, 1
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1]
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v3, v4
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v7, vcc_lo
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v7, vcc_lo, v3, 1
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v0, vcc_lo
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v7, 1
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v7, v4
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4]
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1]
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v2, v5, v7
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v0, v4, v[1:2]
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v0, v5, v7
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2]
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v5, v8
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3]
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1]
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v0, v5, v8
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3]
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6041,76 +6037,72 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX1200-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX1200-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v9, s0, v2, 1
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, 1
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v8, v4
-; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v10, v6
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v9, v6
+; GFX1200-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e64 v11, vcc_lo, 0, v3, s0
 ; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v12, v8, v4
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v13, v10, v6
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v13, v9, v6
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v8, v5, v[0:1]
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v9, v4, v[2:3]
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v14, v1
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v14, vcc_lo, v12, v8
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v15, s0, v13, v9
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v9, v7, v[1:2]
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v10, v4, v[2:3]
 ; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v11, v6, v[0:1]
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v3, vcc_lo, v12, v8
-; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v14, v9, vcc_lo
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v13, v10
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v3, v4
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v8, v2
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v15, v3, v4
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v10, v6
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v16, v10, v6
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v14, v4
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v3, v15, v6
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v8, v11, vcc_lo
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v12, vcc_lo, v12, 1
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v3, v5, v[0:1]
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v1, v10, vcc_lo
+; GFX1200-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e64 v11, vcc_lo, v2, v11, s0
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[8:9], null, v14, v5, v[0:1]
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[9:10], null, v15, v7, v[3:4]
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v10, v15, v6
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v7, v14, v4
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v16, v4, v[8:9]
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v12, 1
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[4:5], null, v11, v6, v[9:10]
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v14, vcc_lo
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[0:1], null, v10, v7, v[1:2]
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v15, v12
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[2:3], null, v9, v4, v[2:3]
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX1200-GISEL-NEXT:    v_add_co_u32 v9, vcc_lo, v13, 1
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v11, v6, v[0:1]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[4:5], null, v15, v5, v[1:2]
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v16, v9
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v14, v16, v9
-; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v11, v3
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v8, v15, v12
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v15, 1
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v7, v8
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v10, v9
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v12, vcc_lo, v7, 1
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX1200-GISEL-NEXT:    v_add_co_u32 v15, vcc_lo, v16, 1
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v3, vcc_lo
+; GFX1200-GISEL-NEXT:    v_add_co_u32 v15, vcc_lo, v10, 1
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[5:6], null, v7, v5, v[0:1]
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v10, v2, v[1:2]
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v2, v14, v15
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v1, v8, v10
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[5:6], null, v16, v7, v[0:1]
-; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v14, v15
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[6:7], null, v2, v12, v[4:5]
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v8, v13, v[1:2]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v9, v[5:6]
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[4:5], null, v14, v11, v[0:1]
-; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v10, v[1:2]
-; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v0, v8, v10
+; GFX1200-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v4, vcc_lo
+; GFX1200-GISEL-NEXT:    v_mul_hi_u32 v0, v11, v12
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[5:6], null, v3, v8, v[5:6]
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v4, v9, v[1:2]
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[6:7], null, v11, v13, v[0:1]
+; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v0, v11, v12
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[7:8], null, v14, v10, v[2:3]
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[1:2], null, v5, v12, v[6:7]
 ; GFX1200-GISEL-NEXT:    v_mul_lo_u32 v2, v14, v15
-; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5]
+; GFX1200-GISEL-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v15, v[7:8]
 ; GFX1200-GISEL-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 94c2e518a9fd356..8f7456b788f81b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -32,13 +32,11 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-LABEL: umulo_i64_v_v:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v5, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v0, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v4, v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v2, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v1, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v9, v7
@@ -54,18 +52,16 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-LABEL: umulo_i64_v_v:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v5, v2, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, v5, v3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v1
-; GFX10-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s4, v0, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v0, v2, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[7:8], s4, v4, v2, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, v4, v3, 0
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v1, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v7
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
@@ -78,23 +74,22 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v5, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[10:11], null, v5, v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v1
-; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v1, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -106,23 +101,21 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
-; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, v4, v3, 0
-; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v5, v2, 0
-; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v5, v3, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v1
-; GFX12-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v0, v3, 0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v0, v2, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v4, v2, 0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v4, v3, 0
+; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, v1, v5
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_add3_u32 v1, v1, v5, v7
+; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v7
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
@@ -181,11 +174,10 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v10, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v1, v6
 ; GFX9-NEXT:    v_mad_i64_i32 v[10:11], s[4:5], v4, v3, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v8
@@ -215,14 +207,13 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v5, v2, 0
 ; GFX10-NEXT:    v_mad_i64_i32 v[10:11], s4, v5, v3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v12, v1
-; GFX10-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v6
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v1, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v1, v6, v8
 ; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
@@ -249,38 +240,37 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v5, v2, 0
 ; GFX11-NEXT:    v_mad_i64_i32 v[10:11], null, v5, v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v12, v1
-; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v1, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v10
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v7, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc_lo
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -294,45 +284,44 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, v4, v3, 0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v5, v2, 0
 ; GFX12-NEXT:    v_mad_co_i64_i32 v[10:11], null, v5, v3, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v12, v1
-; GFX12-NEXT:    v_add3_u32 v1, v1, v6, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_add_co_u32 v12, vcc_lo, v1, v6
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_add_co_u32 v12, vcc_lo, v12, v8
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v10
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v7, v2
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
 ; GFX12-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc_lo
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
 ; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
 ; GFX12-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
 ; GFX12-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index fbda0e71a74c6ed..1fc7349882ba13a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9465,47 +9465,47 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b64 s[12:13], s[2:3], 0x0
+; GFX12-NEXT:    s_load_b64 s[10:11], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s96, s13, 30
-; GFX12-NEXT:    s_lshr_b32 s98, s13, 31
-; GFX12-NEXT:    s_lshr_b32 s92, s13, 28
-; GFX12-NEXT:    s_lshr_b32 s94, s13, 29
-; GFX12-NEXT:    s_lshr_b32 s78, s13, 26
-; GFX12-NEXT:    s_lshr_b32 s88, s13, 27
+; GFX12-NEXT:    s_lshr_b32 s96, s11, 30
+; GFX12-NEXT:    s_lshr_b32 s98, s11, 31
+; GFX12-NEXT:    s_lshr_b32 s92, s11, 28
+; GFX12-NEXT:    s_lshr_b32 s94, s11, 29
+; GFX12-NEXT:    s_lshr_b32 s78, s11, 26
+; GFX12-NEXT:    s_lshr_b32 s88, s11, 27
 ; GFX12-NEXT:    s_bfe_i64 s[96:97], s[96:97], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[100:101], s[98:99], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s66, s13, 24
-; GFX12-NEXT:    s_lshr_b32 s74, s13, 25
+; GFX12-NEXT:    s_lshr_b32 s66, s11, 24
+; GFX12-NEXT:    s_lshr_b32 s74, s11, 25
 ; GFX12-NEXT:    s_bfe_i64 s[92:93], s[92:93], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[94:95], s[94:95], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96
-; GFX12-NEXT:    s_lshr_b32 s56, s13, 22
-; GFX12-NEXT:    s_lshr_b32 s62, s13, 23
+; GFX12-NEXT:    s_lshr_b32 s56, s11, 22
+; GFX12-NEXT:    s_lshr_b32 s62, s11, 23
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92
 ; GFX12-NEXT:    s_bfe_i64 s[78:79], s[78:79], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[88:89], s[88:89], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s44, s13, 20
-; GFX12-NEXT:    s_lshr_b32 s52, s13, 21
-; GFX12-NEXT:    s_lshr_b32 s30, s13, 18
-; GFX12-NEXT:    s_lshr_b32 s40, s13, 19
-; GFX12-NEXT:    s_lshr_b32 s18, s13, 16
-; GFX12-NEXT:    s_lshr_b32 s26, s13, 17
-; GFX12-NEXT:    s_lshr_b32 s2, s13, 14
-; GFX12-NEXT:    s_lshr_b32 s4, s13, 15
+; GFX12-NEXT:    s_lshr_b32 s44, s11, 20
+; GFX12-NEXT:    s_lshr_b32 s52, s11, 21
+; GFX12-NEXT:    s_lshr_b32 s30, s11, 18
+; GFX12-NEXT:    s_lshr_b32 s40, s11, 19
+; GFX12-NEXT:    s_lshr_b32 s18, s11, 16
+; GFX12-NEXT:    s_lshr_b32 s26, s11, 17
+; GFX12-NEXT:    s_lshr_b32 s2, s11, 14
+; GFX12-NEXT:    s_lshr_b32 s4, s11, 15
 ; GFX12-NEXT:    v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94
 ; GFX12-NEXT:    v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78
 ; GFX12-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[74:75], s[74:75], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s6, s13, 12
-; GFX12-NEXT:    s_lshr_b32 s8, s13, 13
+; GFX12-NEXT:    s_lshr_b32 s6, s11, 12
+; GFX12-NEXT:    s_lshr_b32 s8, s11, 13
 ; GFX12-NEXT:    v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88
 ; GFX12-NEXT:    v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66
 ; GFX12-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s10, s13, 10
-; GFX12-NEXT:    s_lshr_b32 s14, s13, 11
+; GFX12-NEXT:    s_lshr_b32 s12, s11, 10
+; GFX12-NEXT:    s_lshr_b32 s14, s11, 11
 ; GFX12-NEXT:    v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74
 ; GFX12-NEXT:    v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56
 ; GFX12-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
@@ -9516,14 +9516,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s16, s13, 8
-; GFX12-NEXT:    s_lshr_b32 s20, s13, 9
+; GFX12-NEXT:    s_lshr_b32 s16, s11, 8
+; GFX12-NEXT:    s_lshr_b32 s20, s11, 9
 ; GFX12-NEXT:    v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62
 ; GFX12-NEXT:    v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44
 ; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s22, s13, 6
-; GFX12-NEXT:    s_lshr_b32 s24, s13, 7
+; GFX12-NEXT:    s_lshr_b32 s22, s11, 6
+; GFX12-NEXT:    s_lshr_b32 s24, s11, 7
 ; GFX12-NEXT:    v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52
 ; GFX12-NEXT:    v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30
 ; GFX12-NEXT:    v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40
@@ -9531,7 +9531,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26
 ; GFX12-NEXT:    v_mov_b32_e32 v32, s27
 ; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
 ; GFX12-NEXT:    s_clause 0x7
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:496
 ; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:480
@@ -9544,36 +9544,36 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, s6
-; GFX12-NEXT:    s_lshr_b32 s28, s13, 4
-; GFX12-NEXT:    s_lshr_b32 s34, s13, 5
-; GFX12-NEXT:    s_lshr_b32 s36, s13, 2
-; GFX12-NEXT:    s_lshr_b32 s38, s13, 3
+; GFX12-NEXT:    s_lshr_b32 s28, s11, 4
+; GFX12-NEXT:    s_lshr_b32 s34, s11, 5
+; GFX12-NEXT:    s_lshr_b32 s36, s11, 2
+; GFX12-NEXT:    s_lshr_b32 s38, s11, 3
 ; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX12-NEXT:    s_lshr_b32 s42, s13, 1
-; GFX12-NEXT:    s_mov_b32 s46, s13
+; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12
+; GFX12-NEXT:    s_lshr_b32 s42, s11, 1
+; GFX12-NEXT:    s_mov_b32 s46, s11
 ; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s14
+; GFX12-NEXT:    v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
 ; GFX12-NEXT:    v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX12-NEXT:    s_lshr_b32 s48, s12, 30
-; GFX12-NEXT:    s_lshr_b32 s50, s12, 31
+; GFX12-NEXT:    s_lshr_b32 s48, s10, 30
+; GFX12-NEXT:    s_lshr_b32 s50, s10, 31
 ; GFX12-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20
 ; GFX12-NEXT:    v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22
-; GFX12-NEXT:    s_lshr_b32 s54, s12, 28
-; GFX12-NEXT:    s_lshr_b32 s58, s12, 29
+; GFX12-NEXT:    s_lshr_b32 s54, s10, 28
+; GFX12-NEXT:    s_lshr_b32 s58, s10, 29
 ; GFX12-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24
 ; GFX12-NEXT:    v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28
-; GFX12-NEXT:    s_lshr_b32 s60, s12, 26
-; GFX12-NEXT:    s_lshr_b32 s64, s12, 27
+; GFX12-NEXT:    s_lshr_b32 s60, s10, 26
+; GFX12-NEXT:    s_lshr_b32 s64, s10, 27
 ; GFX12-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34
@@ -9588,43 +9588,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39
 ; GFX12-NEXT:    v_mov_b32_e32 v5, s46
-; GFX12-NEXT:    s_lshr_b32 s68, s12, 24
-; GFX12-NEXT:    s_lshr_b32 s70, s12, 25
-; GFX12-NEXT:    s_lshr_b32 s72, s12, 22
-; GFX12-NEXT:    s_lshr_b32 s76, s12, 23
+; GFX12-NEXT:    s_lshr_b32 s68, s10, 24
+; GFX12-NEXT:    s_lshr_b32 s70, s10, 25
+; GFX12-NEXT:    s_lshr_b32 s72, s10, 22
+; GFX12-NEXT:    s_lshr_b32 s76, s10, 23
 ; GFX12-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42
 ; GFX12-NEXT:    v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48
-; GFX12-NEXT:    s_lshr_b32 s80, s12, 20
-; GFX12-NEXT:    s_lshr_b32 s82, s12, 21
+; GFX12-NEXT:    s_lshr_b32 s80, s10, 20
+; GFX12-NEXT:    s_lshr_b32 s82, s10, 21
 ; GFX12-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50
 ; GFX12-NEXT:    v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54
-; GFX12-NEXT:    s_lshr_b32 s84, s12, 18
-; GFX12-NEXT:    s_lshr_b32 s86, s12, 19
+; GFX12-NEXT:    s_lshr_b32 s84, s10, 18
+; GFX12-NEXT:    s_lshr_b32 s86, s10, 19
 ; GFX12-NEXT:    s_bfe_i64 s[76:77], s[76:77], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[72:73], s[72:73], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[70:71], s[70:71], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[68:69], s[68:69], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58
 ; GFX12-NEXT:    v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60
-; GFX12-NEXT:    s_lshr_b32 s90, s12, 16
-; GFX12-NEXT:    s_lshr_b32 s98, s12, 17
+; GFX12-NEXT:    s_lshr_b32 s90, s10, 16
+; GFX12-NEXT:    s_lshr_b32 s98, s10, 17
 ; GFX12-NEXT:    s_bfe_i64 s[82:83], s[82:83], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[80:81], s[80:81], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64
 ; GFX12-NEXT:    v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68
-; GFX12-NEXT:    s_lshr_b32 s96, s12, 14
-; GFX12-NEXT:    s_lshr_b32 s100, s12, 15
-; GFX12-NEXT:    s_lshr_b32 s94, s12, 13
-; GFX12-NEXT:    s_lshr_b32 s88, s12, 11
-; GFX12-NEXT:    s_lshr_b32 s74, s12, 9
-; GFX12-NEXT:    s_lshr_b32 s62, s12, 7
-; GFX12-NEXT:    s_lshr_b32 s52, s12, 5
-; GFX12-NEXT:    s_lshr_b32 s40, s12, 3
-; GFX12-NEXT:    s_lshr_b32 s26, s12, 1
+; GFX12-NEXT:    s_lshr_b32 s96, s10, 14
+; GFX12-NEXT:    s_lshr_b32 s100, s10, 15
+; GFX12-NEXT:    s_lshr_b32 s94, s10, 13
+; GFX12-NEXT:    s_lshr_b32 s88, s10, 11
+; GFX12-NEXT:    s_lshr_b32 s74, s10, 9
+; GFX12-NEXT:    s_lshr_b32 s62, s10, 7
+; GFX12-NEXT:    s_lshr_b32 s52, s10, 5
+; GFX12-NEXT:    s_lshr_b32 s40, s10, 3
+; GFX12-NEXT:    s_lshr_b32 s26, s10, 1
 ; GFX12-NEXT:    s_bfe_i64 s[86:87], s[86:87], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[84:85], s[84:85], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70
@@ -9639,19 +9639,19 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77
 ; GFX12-NEXT:    v_mov_b32_e32 v5, s80
-; GFX12-NEXT:    s_lshr_b32 s92, s12, 12
-; GFX12-NEXT:    s_lshr_b32 s78, s12, 10
+; GFX12-NEXT:    s_lshr_b32 s92, s10, 12
+; GFX12-NEXT:    s_lshr_b32 s78, s10, 10
 ; GFX12-NEXT:    s_bfe_i64 s[98:99], s[98:99], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[90:91], s[90:91], 0x10000
 ; GFX12-NEXT:    v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82
 ; GFX12-NEXT:    v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84
-; GFX12-NEXT:    s_lshr_b32 s66, s12, 8
-; GFX12-NEXT:    s_lshr_b32 s56, s12, 6
-; GFX12-NEXT:    s_lshr_b32 s44, s12, 4
-; GFX12-NEXT:    s_lshr_b32 s30, s12, 2
-; GFX12-NEXT:    s_bfe_i64 s[18:19], s[12:13], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s66, s10, 8
+; GFX12-NEXT:    s_lshr_b32 s56, s10, 6
+; GFX12-NEXT:    s_lshr_b32 s44, s10, 4
+; GFX12-NEXT:    s_lshr_b32 s30, s10, 2
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[10:11], 0x10000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[26:27], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[26:27], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[26:27], s[40:41], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[40:41], s[52:53], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[52:53], s[62:63], 0x10000
@@ -9695,8 +9695,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30
 ; GFX12-NEXT:    v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26
 ; GFX12-NEXT:    v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18
-; GFX12-NEXT:    v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s12
-; GFX12-NEXT:    v_mov_b32_e32 v24, s13
+; GFX12-NEXT:    v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10
+; GFX12-NEXT:    v_mov_b32_e32 v24, s11
 ; GFX12-NEXT:    s_clause 0x5
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:80
 ; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 341332e60b5c0dc..4ce3b46211e64ac 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -1843,11 +1843,10 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_ashr_i32 s0, s3, 31
-; GFX7-HSA-NEXT:    s_mov_b32 s1, s3
-; GFX7-HSA-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX7-HSA-NEXT:    s_ashr_i32 s1, s2, 31
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX7-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT:    s_endpgm
@@ -1861,11 +1860,10 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s3, 31
-; GFX8-NOHSA-NEXT:    s_mov_b32 s1, s3
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s1, s2, 31
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -1902,8 +1900,8 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX9-HSA-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-HSA-NEXT:    s_ashr_i32 s5, s2, 31
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-HSA-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-HSA-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 4217384cdd5ce79..8589158f11a7088 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -5933,17 +5933,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -5956,8 +5956,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: global_sextload_v4i16_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 5ce8a2b5f862e11..0573de4a7f2d1df 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1555,13 +1555,14 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
-; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
 ; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
@@ -1571,14 +1572,15 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, s1
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v1
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
 ; GCNX3-HSA-NEXT:    s_endpgm
 ;
 ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
@@ -1591,13 +1593,14 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
-; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
 ; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
-; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
 ; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCNX3-NOHSA-NEXT:    s_endpgm
 ;
@@ -1626,14 +1629,15 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx2 v[4:5], v6, s[2:3]
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-HSA-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
 ; GCN-HSA-NEXT:    s_endpgm
   %ld = load <2 x i32>, ptr addrspace(1) %in
   %ext = sext <2 x i32> %ld to <2 x i64>
@@ -1902,36 +1906,36 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; SI-NOHSA:       ; %bb.0:
-; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
-; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
-; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
-; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
@@ -1981,36 +1985,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ;
 ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCNX3-NOHSA:       ; %bb.0:
-; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
 ; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
 ; GCNX3-NOHSA-NEXT:    s_nop 0
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
 ; GCNX3-NOHSA-NEXT:    s_nop 0
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
 ; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
-; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCNX3-NOHSA-NEXT:    s_endpgm
 ;
 ; EG-LABEL: global_zextload_v8i32_to_v8i64:
@@ -2091,17 +2095,17 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; SI-NOHSA:       ; %bb.0:
-; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
-; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
-; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
 ; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
-; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
-; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
@@ -2121,10 +2125,10 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v13, v3
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v0
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16
-; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
 ; SI-NOHSA-NEXT:    s_endpgm
 ;
 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
@@ -2369,13 +2373,13 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
@@ -2402,10 +2406,10 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v9
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
@@ -2414,30 +2418,30 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v11
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v10
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v10
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v11
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v15
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v14
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v11
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v10
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v9
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v11
 ; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
 ; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[12:15]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
@@ -2611,60 +2615,115 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; EG-NEXT:     MOV * T16.Z, T1.Y,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
-; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v36, 0
-; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
-; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
-; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
-; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v36, s[2:3]
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, v5
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, v7
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v3
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, v9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
-; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v15
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1]
-; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
-; GCN-HSA-NEXT:    s_endpgm
+; GCN-GFX900-HSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCN-GFX900-HSA:       ; %bb.0:
+; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v36, 0
+; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[12:15], v36, s[2:3]
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v24, v4
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v26, v5
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v20, v6
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v22, v7
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v16, v2
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v18, v3
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v28, v8
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v30, v9
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v32, v12
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v34, v13
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, v14
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v10, v15
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1]
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GCN-GFX900-HSA-NEXT:    s_endpgm
+;
+; GCN-GFX908-HSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCN-GFX908-HSA:       ; %bb.0:
+; GCN-GFX908-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-GFX908-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3] offset:32
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:48
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[9:12], v0, s[2:3] offset:16
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[13:16], v0, s[2:3]
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v4
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v6
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v5
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v25, v5
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v27, v6
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v3
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v8
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v21, v7
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v23, v8
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v5, v1
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v7, v2
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v17, v3
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v19, v4
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v12
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v32, 31, v10
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v30, 31, v9
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v29, v9
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v31, v10
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v3, v12
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v14
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v34, 31, v13
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v33, v13
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v35, v14
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v9, v15
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v11, v16
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[25:28], s[0:1] offset:96
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[21:24], s[0:1] offset:112
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:64
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[29:32], s[0:1] offset:32
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:48
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[33:36], s[0:1]
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v0, v[9:12], s[0:1] offset:16
+; GCN-GFX908-HSA-NEXT:    s_endpgm
   %ld = load <16 x i32>, ptr addrspace(1) %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3137,26 +3196,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 64
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
 ; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[8:9]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[12:13]
 ; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
 ; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s7
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s6
 ; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s1
@@ -3223,26 +3282,26 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[24:27]
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(10)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v15
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v14
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v13
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v12
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v12
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v13
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, v14
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v11
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v10
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v9
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, v11
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x90
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[20:23]
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
@@ -3253,58 +3312,58 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[23:26]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v16
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
-; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v17
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[4:7]
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v19
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v18
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, v18
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, v19
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[23:26]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v9
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v8
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, v8
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, v9
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[15:18]
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v15
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v11
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, v1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v3
 ; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v2
-; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v10
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, v3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
 ; GCNX3-HSA-NEXT:    s_endpgm
 ;
@@ -3941,13 +4000,13 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s9
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s7
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s6
 ; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
@@ -4032,29 +4091,29 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v19
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v12
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v13
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v14
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v15
-; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v8
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v13
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v10
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v15
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index b6ff99214249afc..a5f6c2fe5d26441 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -92,11 +92,11 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v9, s0
 ; GFX940-SDAG-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v5, v10
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, v12
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
 ; GFX940-SDAG-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
@@ -523,11 +523,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v9, s0
 ; GFX940-SDAG-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v5, v10
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, v12
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
 ; GFX940-SDAG-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
 ; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 11c62a7312755b1..d4f75051b04d49e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -226,19 +226,17 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; CI-NEXT:    v_mov_b32_e32 v8, 0
 ; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v12, v1, v[7:8]
 ; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
-; CI-NEXT:    v_mov_b32_e32 v11, v10
-; CI-NEXT:    v_mov_b32_e32 v10, v8
-; CI-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[9:10]
-; CI-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
+; CI-NEXT:    v_mov_b32_e32 v7, v9
+; CI-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[7:8]
+; CI-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CI-NEXT:    v_mad_i64_i32 v[10:11], s[4:5], v1, v12, 0
 ; CI-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
 ; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v13, v[8:9]
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v13, v0, v[10:11]
 ; CI-NEXT:    v_add_i32_e32 v8, vcc, v8, v0
 ; CI-NEXT:    v_addc_u32_e32 v9, vcc, v9, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v1, v7
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
-; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, v7, v3, vcc
 ; CI-NEXT:    v_addc_u32_e32 v2, vcc, v8, v4, vcc
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
 ; CI-NEXT:    s_setpc_b64 s[30:31]
@@ -280,27 +278,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v12, v1, v[8:9]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
-; GFX9-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v12, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
-; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v10
+; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[8:9]
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v9
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v12, v14, v[10:11]
+; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v12, 0
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v11, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v5, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -312,25 +307,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX1100-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
-; GFX1100-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[7:8], null, v0, v15, v[9:10]
-; GFX1100-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1100-NEXT:    v_mad_i64_i32 v[8:9], null, v1, v14, 0
+; GFX1100-NEXT:    v_mov_b32_e32 v7, v9
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[7:8]
+; GFX1100-NEXT:    v_mad_i64_i32 v[7:8], null, v1, v14, 0
+; GFX1100-NEXT:    v_add_co_u32 v9, s0, v10, v12
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, 0, s0
+; GFX1100-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[7:8]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add_co_u32 v10, s0, v11, v10
-; GFX1100-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[8:9]
-; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[10:11]
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
-; GFX1100-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
+; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[9:10]
+; GFX1100-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v12
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v13, vcc_lo
 ; GFX1100-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
-; GFX1100-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
-; GFX1100-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
+; GFX1100-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v11, v3, vcc_lo
+; GFX1100-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v4, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v8, v5, vcc_lo
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1150-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -338,21 +332,20 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1150-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
 ; GFX1150-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1150-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
-; GFX1150-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mad_u64_u32 v[9:10], null, v12, v1, v[7:8]
-; GFX1150-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
+; GFX1150-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
+; GFX1150-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_mad_i64_i32 v[11:12], null, v1, v13, 0
+; GFX1150-NEXT:    v_mad_u64_u32 v[9:10], null, v13, v1, v[7:8]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mad_u64_u32 v[7:8], null, v0, v13, v[9:10]
-; GFX1150-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1150-NEXT:    v_mad_i64_i32 v[8:9], null, v1, v12, 0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_co_u32 v10, s0, v11, v10
-; GFX1150-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v13, v0, v[8:9]
-; GFX1150-NEXT:    v_mad_u64_u32 v[8:9], null, v12, v13, v[10:11]
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v9
+; GFX1150-NEXT:    v_mad_u64_u32 v[7:8], null, v0, v14, v[7:8]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v14, v0, v[11:12]
+; GFX1150-NEXT:    v_add_co_u32 v8, s0, v10, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, 0, s0
+; GFX1150-NEXT:    v_mad_u64_u32 v[8:9], null, v13, v14, v[8:9]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1150-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v0
 ; GFX1150-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
@@ -372,22 +365,21 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, v0, v1, 0
 ; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
-; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v12, v1, v[7:8]
-; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
+; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
+; GFX12-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_mad_co_i64_i32 v[11:12], null, v1, v13, 0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v13, v1, v[7:8]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10]
-; GFX12-NEXT:    v_mov_b32_e32 v10, v8
-; GFX12-NEXT:    v_mad_co_i64_i32 v[8:9], null, v1, v12, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_add_co_u32 v10, s0, v11, v10
+; GFX12-NEXT:    v_mov_b32_e32 v7, v9
+; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v0, v14, v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v14, v0, v[11:12]
+; GFX12-NEXT:    v_add_co_u32 v8, s0, v10, v8
 ; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
-; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11]
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, 0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9]
 ; GFX12-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v0
 ; GFX12-NEXT:    s_wait_alu 0xfffd
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
index fab5d386446d3ce..6f21df3a06ce745 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
@@ -28,29 +28,28 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN-NEXT:  ; %bb.1: ; %atomic
 ; GCN-NEXT:    s_mov_b32 s8, s10
 ; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400
+; GCN-NEXT:    buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_max_i32_e32 v3, s4, v4
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-NEXT:    v_mov_b32_e32 v5, v3
-; GCN-NEXT:    buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_max_i32_e32 v4, s4, v5
+; GCN-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, v5
+; GCN-NEXT:    buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v4, v5
+; GCN-NEXT:    v_mov_b32_e32 v5, v3
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_2
 ; GCN-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 ; GCN-NEXT:  .LBB0_4: ; %exit
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 01eb1b1a353d12a..2003cb163a985c7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2653,41 +2653,38 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ;
 ; VI-LABEL: s_mul_i128:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4c
-; VI-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x7c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4c
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x7c
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s10
-; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0
-; VI-NEXT:    s_mul_i32 s4, s12, s11
-; VI-NEXT:    v_mov_b32_e32 v6, s12
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
-; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0
-; VI-NEXT:    s_mul_i32 s6, s13, s10
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
+; VI-NEXT:    s_mul_i32 s3, s8, s3
+; VI-NEXT:    v_mov_b32_e32 v6, s8
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s3, v3
+; VI-NEXT:    s_mul_i32 s12, s9, s2
+; VI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s0, v6, 0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s12, v3
 ; VI-NEXT:    v_mov_b32_e32 v4, v1
-; VI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v8, s8
-; VI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s14, v8, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, v7
-; VI-NEXT:    v_mov_b32_e32 v7, v5
-; VI-NEXT:    v_mov_b32_e32 v8, s13
-; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s8, v8, v[6:7]
-; VI-NEXT:    s_mul_i32 s6, s15, s8
-; VI-NEXT:    v_add_u32_e32 v6, vcc, s6, v2
-; VI-NEXT:    v_mov_b32_e32 v2, v5
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s9, v8, v[2:3]
-; VI-NEXT:    s_mul_i32 s6, s14, s9
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s6, v6
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, v4
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], s1, v6, v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NEXT:    v_mad_u64_u32 v[8:9], s[2:3], s10, v8, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v4, v6
+; VI-NEXT:    v_mov_b32_e32 v6, s9
+; VI-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s0, v6, v[4:5]
+; VI-NEXT:    s_mul_i32 s8, s11, s0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s8, v9
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
+; VI-NEXT:    v_addc_u32_e64 v3, s[2:3], 0, 0, vcc
+; VI-NEXT:    s_mul_i32 s8, s10, s1
+; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s1, v6, v[2:3]
+; VI-NEXT:    v_add_u32_e32 v4, vcc, s8, v4
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_i128:
@@ -3012,52 +3009,49 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, v3, v2
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10]
-; VI-NEXT:    v_mov_b32_e32 v4, v3
-; VI-NEXT:    v_mov_b32_e32 v3, v10
-; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3]
-; VI-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15]
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
-; VI-NEXT:    v_addc_u32_e64 v4, s[0:1], 0, 0, vcc
-; VI-NEXT:    v_mul_lo_u32 v0, v7, v0
-; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4]
-; VI-NEXT:    v_mul_lo_u32 v1, v6, v1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v3, v9
-; VI-NEXT:    v_addc_u32_e32 v11, vcc, v4, v0, vcc
+; VI-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15]
 ; VI-NEXT:    v_mov_b32_e32 v9, v2
+; VI-NEXT:    v_mul_lo_u32 v2, v7, v0
+; VI-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10]
+; VI-NEXT:    v_mul_lo_u32 v4, v6, v1
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v2, v15
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v10
+; VI-NEXT:    v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v6
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v0, v14
+; VI-NEXT:    v_addc_u32_e32 v11, vcc, v1, v2, vcc
 ; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_mul_i128:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v14, s[0:1]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v14, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v2
-; GFX9-NEXT:    v_mul_lo_u32 v13, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
-; GFX9-NEXT:    v_add3_u32 v9, v9, v13, v10
-; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v1
+; GFX9-NEXT:    v_add3_u32 v9, v9, v12, v10
+; GFX9-NEXT:    v_mul_lo_u32 v15, v6, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v10, v3
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9]
-; GFX9-NEXT:    v_mov_b32_e32 v10, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, v11
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
-; GFX9-NEXT:    v_mul_lo_u32 v0, v7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11]
-; GFX9-NEXT:    v_add3_u32 v0, v0, v9, v13
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
-; GFX9-NEXT:    global_store_dwordx4 v12, v[2:5], s[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v10, v12
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11]
+; GFX9-NEXT:    v_mul_lo_u32 v10, v7, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9]
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v13, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9]
+; GFX9-NEXT:    v_add3_u32 v5, v10, v7, v15
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
+; GFX9-NEXT:    global_store_dwordx4 v14, v[2:5], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_mul_i128:
@@ -3071,22 +3065,20 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
+; GFX10-NEXT:    v_mul_lo_u32 v14, v5, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v12
-; GFX10-NEXT:    v_mov_b32_e32 v12, v10
-; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
+; GFX10-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v10
-; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v15
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v14, v4
+; GFX10-NEXT:    v_mul_lo_u32 v4, v6, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s0, v0, v5, v[9:10]
+; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v14
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v12, v10
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
-; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
+; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v4
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
@@ -3097,37 +3089,37 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
+; GFX11-NEXT:    v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v15, s[0:1]
-; GFX11-NEXT:    global_load_b128 v[4:7], v15, s[2:3]
+; GFX11-NEXT:    global_load_b128 v[0:3], v17, s[0:1]
+; GFX11-NEXT:    global_load_b128 v[4:7], v17, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
-; GFX11-NEXT:    v_mul_lo_u32 v14, v5, v2
+; GFX11-NEXT:    v_mul_lo_u32 v18, v5, v2
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
-; GFX11-NEXT:    v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v4, v2, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[15:16], null, v4, v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v11, v9
+; GFX11-NEXT:    v_add3_u32 v16, v16, v3, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12]
 ; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
-; GFX11-NEXT:    v_mov_b32_e32 v2, v10
-; GFX11-NEXT:    v_mul_lo_u32 v10, v7, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v3, v14
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v11, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
+; GFX11-NEXT:    v_mul_lo_u32 v12, v7, v0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v14, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
-; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16]
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
-; GFX11-NEXT:    v_add3_u32 v0, v10, v14, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v13
+; GFX11-NEXT:    v_add3_u32 v0, v12, v11, v4
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
-; GFX11-NEXT:    global_store_b128 v15, v[8:11], s[2:3]
+; GFX11-NEXT:    global_store_b128 v17, v[8:11], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i128:
@@ -3142,29 +3134,27 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX12-NEXT:    global_load_b128 v[4:7], v13, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
-; GFX12-NEXT:    v_mul_lo_u32 v15, v5, v2
+; GFX12-NEXT:    v_mul_lo_u32 v14, v5, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
-; GFX12-NEXT:    v_mov_b32_e32 v14, v12
-; GFX12-NEXT:    v_mov_b32_e32 v12, v10
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
+; GFX12-NEXT:    v_mov_b32_e32 v9, v11
 ; GFX12-NEXT:    v_mul_lo_u32 v11, v4, v3
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
-; GFX12-NEXT:    v_mul_lo_u32 v12, v6, v1
-; GFX12-NEXT:    v_mov_b32_e32 v4, v10
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add3_u32 v3, v3, v11, v15
-; GFX12-NEXT:    v_add_co_u32 v10, s0, v14, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mul_lo_u32 v4, v6, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10]
+; GFX12-NEXT:    v_add3_u32 v3, v3, v11, v14
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_u32 v10, s0, v12, v10
 ; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
-; GFX12-NEXT:    v_add3_u32 v3, v7, v3, v12
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v3, v7, v3, v4
 ; GFX12-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 704947523f677c9..afe1f33d15e4226 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -203,30 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_or_b32_e32 v13, v8, v14
 ; GFX9-NEXT:  .LBB0_6: ; %Flow3
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v16, v13, v5
+; GFX9-NEXT:    v_mul_lo_u32 v18, v13, v5
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15]
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v11, v4
-; GFX9-NEXT:    v_mul_lo_u32 v10, v10, v23
-; GFX9-NEXT:    v_mov_b32_e32 v4, v14
-; GFX9-NEXT:    v_mov_b32_e32 v14, v15
-; GFX9-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v23, v11, v[13:14]
-; GFX9-NEXT:    v_add3_u32 v8, v8, v16, v9
+; GFX9-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15]
+; GFX9-NEXT:    v_add3_u32 v8, v8, v18, v9
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8]
-; GFX9-NEXT:    v_mov_b32_e32 v8, v14
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v12, v12, v22
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v22, v11, v[8:9]
-; GFX9-NEXT:    v_add3_u32 v4, v10, v7, v12
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v7, v13
+; GFX9-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15]
+; GFX9-NEXT:    v_mul_lo_u32 v4, v12, v22
+; GFX9-NEXT:    v_mul_lo_u32 v12, v10, v23
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v17, v9
+; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10]
+; GFX9-NEXT:    v_add3_u32 v4, v12, v7, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v9, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v4, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v20
@@ -1683,27 +1680,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_mul_lo_u32 v19, v12, v7
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v17, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0
+; GFX9-NEXT:    v_mul_lo_u32 v18, v13, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v16, v8
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17]
-; GFX9-NEXT:    v_mul_lo_u32 v18, v13, v6
-; GFX9-NEXT:    v_mul_lo_u32 v16, v15, v4
-; GFX9-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-NEXT:    v_mov_b32_e32 v12, v17
-; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12]
 ; GFX9-NEXT:    v_add3_u32 v10, v10, v19, v18
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10]
-; GFX9-NEXT:    v_mov_b32_e32 v4, v12
-; GFX9-NEXT:    v_mul_lo_u32 v10, v14, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v14, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15]
-; GFX9-NEXT:    v_add3_u32 v6, v16, v9, v10
+; GFX9-NEXT:    v_mov_b32_e32 v16, v11
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17]
+; GFX9-NEXT:    v_mul_lo_u32 v6, v14, v5
+; GFX9-NEXT:    v_mul_lo_u32 v14, v15, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
+; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12]
+; GFX9-NEXT:    v_add3_u32 v6, v14, v9, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 57c54c4de710274..d06d9f97db71c99 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -1594,20 +1594,20 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i23:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s10, s6
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s10, s2
+; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s2
-; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    s_mov_b32 s8, s6
+; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
 ; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
 ; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
 ; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
@@ -1632,7 +1632,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; TONGA-LABEL: v_sdiv_i23:
@@ -1783,20 +1783,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i24:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s10, s6
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s10, s2
+; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s2
-; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    s_mov_b32 s8, s6
+; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
 ; GCN-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
 ; GCN-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
 ; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
@@ -1819,7 +1819,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; TONGA-LABEL: v_sdiv_i24:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index ae70abc7317c314..e2bcf3f6a2e2cd3 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9779,118 +9779,111 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 8, v0
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:224
 ; GFX6-NEXT:    s_mov_b32 s2, 0x86a00
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:208
 ; GFX6-NEXT:    s_mov_b32 s2, 0x86600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:192
 ; GFX6-NEXT:    s_mov_b32 s2, 0x86200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:176
 ; GFX6-NEXT:    s_mov_b32 s2, 0x85e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:160
 ; GFX6-NEXT:    s_mov_b32 s2, 0x85a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:144
 ; GFX6-NEXT:    s_mov_b32 s2, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:128
 ; GFX6-NEXT:    s_mov_b32 s2, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:112
 ; GFX6-NEXT:    s_mov_b32 s2, 0x84e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:96
 ; GFX6-NEXT:    s_mov_b32 s2, 0x84a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:80
 ; GFX6-NEXT:    s_mov_b32 s2, 0x84600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x84200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT:    s_mov_b32 s2, 0x83a00
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
-; GFX6-NEXT:    s_mov_b32 s2, 0x83200
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT:    s_mov_b32 s2, 0x83600
+; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:64
+; GFX6-NEXT:    s_mov_b32 s2, 0x84200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
 ; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[8:11], v[5:6], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:16
+; GFX6-NEXT:    s_mov_b32 s2, 0x83600
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32
+; GFX6-NEXT:    s_mov_b32 s2, 0x83a00
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
@@ -9905,17 +9898,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:48
 ; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v8
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(3)
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    buffer_store_dword v7, v4, s[40:43], 0 offen
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -9938,7 +9930,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    s_mov_b32 s6, 0x83200
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[8:15]
 ; GFX6-NEXT:    ;;#ASMEND
@@ -9957,6 +9949,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s33
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    buffer_store_dword v8, off, s[40:43], s6 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v9, off, s[40:43], s6 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v10, off, s[40:43], s6 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v11, off, s[40:43], s6 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
 ; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GFX6-NEXT:    s_mov_b64 vcc, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
@@ -10187,126 +10184,127 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s0, 0x86a00
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[38:39], s[2:3]
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:240
+; GFX6-NEXT:    s_mov_b32 s0, 0x86a00
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x86600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x86200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x85e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x85a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x85600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x85200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x84e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x84a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x84600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x84200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:80
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83a00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:64
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s0, 0x83e00
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_mov_b32 s0, 0x83a00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:48
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x83600
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:32
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b32 s0, 0x83200
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:16
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT:    buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
index 1cc5b7f7d14eec6..57496c2be54be41 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
@@ -85,13 +85,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 {
 ; GFX908-DAG: v_accvgpr_read_b32
 
 ; GFX900: NumVgprs: 256
-; GFX900: ScratchSize: 132
-; GFX908: NumVgprs: 252
+; GFX900: ScratchSize: 148
+; GFX908: NumVgprs: 254
 ; GFX908: ScratchSize: 0
 ; GFX900:    VGPRBlocks: 63
-; GFX908:    VGPRBlocks: 62
+; GFX908:    VGPRBlocks: 63
 ; GFX900:    NumVGPRsForWavesPerEU: 256
-; GFX908:    NumVGPRsForWavesPerEU: 252
+; GFX908:    NumVGPRsForWavesPerEU: 254
 define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 8150328dd24f039..ef1adbb395e76ef 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; VI-LABEL: ashr_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
-; VI-NEXT:    v_readfirstlane_b32 s1, v3
-; VI-NEXT:    v_readfirstlane_b32 s2, v0
-; VI-NEXT:    v_readfirstlane_b32 s3, v1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s10, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s11, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s2, s0
-; VI-NEXT:    s_ashr_i32 s2, s9, s11
-; VI-NEXT:    s_ashr_i32 s1, s3, s1
-; VI-NEXT:    s_ashr_i32 s3, s8, s10
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_readfirstlane_b32 s4, v2
+; VI-NEXT:    v_readfirstlane_b32 s5, v3
+; VI-NEXT:    v_readfirstlane_b32 s6, v0
+; VI-NEXT:    v_readfirstlane_b32 s7, v1
+; VI-NEXT:    s_ashr_i32 s8, s7, 16
+; VI-NEXT:    s_sext_i32_i16 s7, s7
+; VI-NEXT:    s_ashr_i32 s9, s6, 16
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_ashr_i32 s10, s5, 16
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_ashr_i32 s11, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_ashr_i32 s4, s6, s4
+; VI-NEXT:    s_ashr_i32 s6, s9, s11
+; VI-NEXT:    s_ashr_i32 s5, s7, s5
+; VI-NEXT:    s_ashr_i32 s7, s8, s10
+; VI-NEXT:    s_lshl_b32 s7, s7, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s5, s5, s7
+; VI-NEXT:    s_or_b32 s4, s4, s6
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: ashr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 530226baa775e1b..0d682a6627a1af0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2509,9 +2509,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, v1
-; VI-NEXT:    v_mov_b32_e32 v1, v3
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v3
 ; VI-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
 ; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
 ; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 7
@@ -2530,9 +2528,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v3
 ; GCN-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
 ; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 7
@@ -2548,10 +2544,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x71b47843, v4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0x71b47843, v5, v[2:3]
 ; GFX1030-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xa7c5ac4, v4, v[2:3]
-; GFX1030-NEXT:    v_mov_b32_e32 v1, v3
-; GFX1030-NEXT:    v_add_co_u32 v0, s4, v0, v1
+; GFX1030-NEXT:    v_add_co_u32 v0, s4, v1, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s4
 ; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0xa7c5ac4, v5, v[0:1]
 ; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 7
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index c6cc479b5deb1ee..5360ff2fa402f14 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -2411,7 +2411,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -2435,7 +2434,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2457,7 +2455,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -2480,8 +2477,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -2502,8 +2498,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX12-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm



More information about the llvm-commits mailing list