[llvm] [GlobalISel][AMDGPU] Import patterns with multiple defs (PR #84171)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 8 00:39:11 PST 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/84171

>From 921fa3d2072f1315c629a4355ebd32db000f6b08 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 6 Mar 2024 14:59:44 +0100
Subject: [PATCH 1/2] [GlobalISel][AMDGPU] Import patterns with multiple defs

Fixes #63216
---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  23 +-
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   | 162 ++---
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    | 235 ++++---
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 592 +++++++++---------
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 553 ++++++++--------
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    | 305 +++++----
 llvm/utils/TableGen/GlobalISelEmitter.cpp     |  27 +-
 7 files changed, 953 insertions(+), 944 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 334cfad478f151..b12c1c41b62b00 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -728,25 +728,34 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
 def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
 } // End SubtargetPredicate = isGFX9Plus
 
-// FIXME: GlobalISel in general does not handle instructions with 2 results,
-// so it cannot use these patterns.
 multiclass IMAD32_Pats <VOP3_Pseudo inst> {
   def : GCNPat <
         (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
-        (EXTRACT_SUBREG (inst $src0, $src1,
+        (EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
                               (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
                                             $src2, sub0,
                                             (i32 (IMPLICIT_DEF)), sub1),
                                             0 /* clamp */),
                         sub0)
         >;
+
+  // GISel-specific pattern that avoids creating an SGPR->VGPR copy if
+  // $src2 is a VGPR.
+  def : GCNPat <
+        (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, VGPR_32:$src2),
+        (EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
+                              (REG_SEQUENCE VReg_64,
+                                            $src2, sub0,
+                                            (i32 (IMPLICIT_DEF)), sub1),
+                                            0 /* clamp */),
+                        sub0)
+        >;
+
   // Immediate src2 in the pattern above will not fold because it would be partially
   // undef. Hence define specialized pattern for this case.
-  // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
-  // make it SDAG only.
   def : GCNPat <
-        (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
-        (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
+        (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
+        (EXTRACT_SUBREG (inst i32:$src0, i32:$src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
         >;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index d671a1d87b63df..1140ef88ac7f85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -8,34 +8,35 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-LABEL: v_mul_i64_no_zext:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v0, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v0, v1
-; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[2:3]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i64_no_zext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[0:1], v9, s[0:1]
+; GFX11-NEXT:    global_load_b64 v[2:3], v9, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, 0
-; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v5, v5, v0, v1
-; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v5, v7
+; GFX11-NEXT:    global_store_b64 v9, v[4:5], s[2:3]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -64,8 +65,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
 ; GFX10-NEXT:    global_load_dword v4, v3, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v0, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v4, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -79,12 +81,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v1, s[6:7]
-; GFX11-NEXT:    global_load_b32 v4, v2, s[0:1]
+; GFX11-NEXT:    global_load_b32 v5, v2, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v4, 0
-; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v5, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,8 +117,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v4, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -128,13 +132,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v4, v1, s[6:7]
+; GFX11-NEXT:    global_load_b32 v5, v1, s[6:7]
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT:    v_mul_lo_u32 v0, v4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -211,8 +216,9 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT:    v_mul_lo_u32 v0, v4, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -225,13 +231,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v4, v0, s[6:7]
+; GFX11-NEXT:    global_load_b32 v5, v0, s[6:7]
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT:    v_mul_lo_u32 v0, v4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -390,15 +397,16 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xfff00000, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xf00f, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xfff00000, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v3
-; GFX10-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v6, v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v6, v3, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xf00f, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, v2, v[5:6]
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_store_dwordx2 v0, v[4:5], s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -412,17 +420,18 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
 ; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[6:7]
 ; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xfff00000, v0
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xf00f, v1
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xfff00000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
-; GFX11-NEXT:    v_mul_lo_u32 v2, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v7, v2, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xf00f, v1
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT:    global_store_b64 v0, v[4:5], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -491,27 +500,31 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v0, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v0, s[2:3]
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s1, v2, v0, 0
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, v2, v4, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2]
+; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:  .LBB10_2: ; %Flow
 ; GFX10-NEXT:    s_andn2_saveexec_b32 s0, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:  ; %bb.4: ; %endif
+; GFX10-NEXT:  .LBB10_4: ; %endif
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -526,22 +539,29 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[2:3], v0, s[6:7]
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT:    global_load_b64 v[4:5], v0, s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_cmpx_ge_u64_e32 0, v[2:3]
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX11-NEXT:  ; %bb.1: ; %else
-; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v2, v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:    v_mov_b32_e32 v1, v3
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT:  .LBB10_2: ; %Flow
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:  ; %bb.4: ; %endif
+; GFX11-NEXT:  .LBB10_4: ; %endif
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 4c1935d06517e5..2d81452f9ef38d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -483,19 +483,18 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v3
-; GFX10-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
-; GFX11-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GFX11-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v5, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_mul_i64:
@@ -506,11 +505,10 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
-; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
-; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_add3_u32 v1, v4, v3, v1
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i64 %num, %den
   ret i64 %result
@@ -653,11 +651,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v1
-; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v3
-; GFX10-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GFX10-NEXT:    v_mul_lo_u32 v8, v7, v4
+; GFX10-NEXT:    v_mul_lo_u32 v0, v6, v5
+; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1]
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v6, v3, 0
-; GFX10-NEXT:    v_add3_u32 v2, v5, v8, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -666,11 +664,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
-; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v3
-; GFX11-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GFX11-NEXT:    v_mul_lo_u32 v8, v7, v4
+; GFX11-NEXT:    v_mul_lo_u32 v0, v6, v5
+; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1]
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v3, 0
-; GFX11-NEXT:    v_add3_u32 v2, v5, v8, v2
+; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
+; GFX11-NEXT:    v_mov_b32_e32 v2, v9
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -683,15 +681,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
-; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GFX12-NEXT:    v_mul_lo_u32 v8, v7, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mul_lo_u32 v0, v6, v5
+; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add3_u32 v2, v5, v8, v2
+; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
+; GFX12-NEXT:    v_mov_b32_e32 v2, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i96 %num, %den
@@ -978,7 +976,6 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v4
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v6, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v6
@@ -987,32 +984,31 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v11
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX10-NEXT:    v_mul_lo_u32 v5, v10, v5
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v3, v4, v5, v3
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
-; GFX11-NEXT:    v_mov_b32_e32 v10, v2
-; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v4
+; GFX11-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
+; GFX11-NEXT:    v_mov_b32_e32 v12, v3
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v6, 0
-; GFX11-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GFX11-NEXT:    v_mul_lo_u32 v6, v9, v6
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v4, 0
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12]
-; GFX11-NEXT:    v_mov_b32_e32 v2, v11
+; GFX11-NEXT:    v_mul_lo_u32 v4, v9, v6
+; GFX11-NEXT:    v_mul_lo_u32 v6, v8, v7
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v11, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX11-NEXT:    v_mul_lo_u32 v5, v10, v5
-; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v3, v4, v5, v3
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v3, v6, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_mul_i128:
@@ -1024,25 +1020,25 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
 ; GFX12-NEXT:    v_mov_b32_e32 v10, v2
-; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
 ; GFX12-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GFX12-NEXT:    v_mul_lo_u32 v6, v9, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v11
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX12-NEXT:    v_mul_lo_u32 v5, v10, v5
-; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
 ; GFX12-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add3_u32 v3, v4, v5, v3
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i128 %num, %den
   ret i128 %result
@@ -2248,7 +2244,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_mov_b32_e32 v17, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
 ; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v10
-; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v14, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v16, v12, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v30, v17, v14
@@ -2308,78 +2303,78 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v8, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i256:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX11-NEXT:    v_mul_lo_u32 v7, v7, v8
-; GFX11-NEXT:    v_mul_lo_u32 v27, v6, v9
-; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
+; GFX11-NEXT:    v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
+; GFX11-NEXT:    v_mul_lo_u32 v30, v4, v11
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], null, v16, v12, 0
-; GFX11-NEXT:    v_mul_lo_u32 v30, v17, v14
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v16, v12, 0
+; GFX11-NEXT:    v_mul_lo_u32 v29, v17, v14
+; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
 ; GFX11-NEXT:    v_mad_u64_u32 v[20:21], null, v16, v10, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v27, vcc_lo, 0, v24, vcc_lo
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX11-NEXT:    v_mov_b32_e32 v20, v22
-; GFX11-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
-; GFX11-NEXT:    v_mov_b32_e32 v20, v18
-; GFX11-NEXT:    v_mov_b32_e32 v19, v22
-; GFX11-NEXT:    v_mul_lo_u32 v22, v16, v15
-; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v8, 0
-; GFX11-NEXT:    v_mul_lo_u32 v20, v4, v11
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX11-NEXT:    v_mul_lo_u32 v25, v3, v12
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX11-NEXT:    v_mov_b32_e32 v14, v21
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
-; GFX11-NEXT:    v_mul_lo_u32 v24, v2, v13
-; GFX11-NEXT:    v_mov_b32_e32 v13, v1
-; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX11-NEXT:    v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
-; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, v8, v7
+; GFX11-NEXT:    v_mov_b32_e32 v20, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v22
+; GFX11-NEXT:    v_mul_lo_u32 v22, v6, v9
+; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
+; GFX11-NEXT:    v_mov_b32_e32 v6, v25
+; GFX11-NEXT:    v_mul_lo_u32 v25, v16, v15
+; GFX11-NEXT:    v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v18, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s2
+; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
+; GFX11-NEXT:    v_mul_lo_u32 v20, v2, v13
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, s2, 0, v8, s2
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
+; GFX11-NEXT:    v_mul_lo_u32 v21, v3, v12
+; GFX11-NEXT:    v_mov_b32_e32 v12, v24
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, s2, 0, v8, s2
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
+; GFX11-NEXT:    v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s4
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, s4, 0, v10, s4
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s5, v23, v25, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s4, v7, v29, s4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s2, v7, v20, s2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s2, v7, v21, s3
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s1, v7, v30, s1
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v28, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v7, v22, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_mul_i256:
@@ -2391,9 +2386,8 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
 ; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
-; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX12-NEXT:    v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
 ; GFX12-NEXT:    v_mul_lo_u32 v30, v17, v14
@@ -2472,9 +2466,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_nc_u32_e32 v7, v8, v7
+; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i256 %num, %den
   ret i256 %result
@@ -2684,23 +2678,22 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10-LABEL: s_mul_u64_sext_with_vregs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_load_dword v2, v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v4, 0x50, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_mul_u64_sext_with_vregs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b32 v4, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
-; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
-; GFX11-NEXT:    v_mul_lo_u32 v4, 0x50, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, v3, v4
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
+; GFX11-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 5297df3bedf8f2..b666f45521661c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -474,97 +474,94 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX10-NEXT:    s_sub_u32 s10, 0, s8
-; GFX10-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v4, s10, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
-; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s11, s10, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
+; GFX10-NEXT:    s_subb_u32 s11, 0, s9
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
+; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, v2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX10-NEXT:    v_add_co_u32 v4, s14, v4, v5
+; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s14, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
 ; GFX10-NEXT:    v_add_co_u32 v6, s14, v7, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s14
-; GFX10-NEXT:    v_add_co_u32 v0, s14, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s14, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s14
-; GFX10-NEXT:    v_add_co_u32 v4, s14, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v2, s14, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s14, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX10-NEXT:    v_add_co_u32 v0, s14, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s14
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v0
-; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s11, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v2
-; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
+; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, v2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX10-NEXT:    v_add_co_u32 v4, s10, v4, v5
+; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
 ; GFX10-NEXT:    v_add_co_u32 v6, s10, v7, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v4, s10, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
-; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v1
-; GFX10-NEXT:    v_mul_hi_u32 v4, s1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
+; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT:    v_mul_hi_u32 v6, s0, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v4, s10, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v4, v6
+; GFX10-NEXT:    v_mul_hi_u32 v3, s0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v5, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v3, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT:    v_mul_lo_u32 v4, s9, v2
-; GFX10-NEXT:    v_add3_u32 v3, v3, v0, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v2, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v5, s10, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v5, 0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v0
@@ -578,8 +575,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v13, s0, v4, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v5, s0
+; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v1
@@ -587,14 +584,14 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
 ; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
@@ -1902,25 +1899,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    s_xor_b64 s[6:7], s[0:1], s[16:17]
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX10-NEXT:    s_sub_u32 s20, 0, s6
-; GFX10-NEXT:    s_subb_u32 s21, 0, s7
+; GFX10-NEXT:    s_sub_u32 s21, 0, s6
+; GFX10-NEXT:    s_subb_u32 s20, 0, s7
 ; GFX10-NEXT:    s_ashr_i32 s12, s15, 31
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX10-NEXT:    s_xor_b64 s[18:19], s[4:5], s[16:17]
 ; GFX10-NEXT:    s_ashr_i32 s16, s3, 31
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    s_add_u32 s14, s14, s12
 ; GFX10-NEXT:    s_addc_u32 s15, s15, s12
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    s_add_u32 s2, s2, s16
 ; GFX10-NEXT:    s_mov_b32 s17, s16
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s16
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
 ; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1929,259 +1926,256 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s20, v5
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v0
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v4
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s20, v6, 0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s21, v6
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v4
+; GFX10-NEXT:    v_trunc_f32_e32 v6, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v6
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s21, v7, 0
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    s_sub_u32 s5, 0, s2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v3
+; GFX10-NEXT:    v_mul_hi_u32 v10, v9, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s22, s5, v8, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2]
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v3
+; GFX10-NEXT:    v_mul_hi_u32 v6, v7, v0
 ; GFX10-NEXT:    s_subb_u32 s22, 0, s3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX10-NEXT:    v_mul_lo_u32 v9, s5, v3
-; GFX10-NEXT:    v_add3_u32 v7, v1, v7, v8
-; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v11, v6, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s23, s5, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s22, v4
-; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
-; GFX10-NEXT:    v_mul_hi_u32 v0, v5, v0
-; GFX10-NEXT:    v_mul_lo_u32 v13, v5, v7
-; GFX10-NEXT:    v_mul_hi_u32 v14, v6, v7
-; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT:    v_add3_u32 v2, v2, v9, v8
-; GFX10-NEXT:    v_add_co_u32 v10, s23, v10, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v0, s23, v13, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, v3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v12, v8, v2
+; GFX10-NEXT:    v_mul_lo_u32 v11, v5, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5]
+; GFX10-NEXT:    v_mul_lo_u32 v4, v9, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2]
+; GFX10-NEXT:    v_mul_hi_u32 v2, v5, v2
+; GFX10-NEXT:    v_mul_lo_u32 v13, v7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v14, v9, v3
+; GFX10-NEXT:    v_mul_hi_u32 v15, v7, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1]
+; GFX10-NEXT:    v_mul_hi_u32 v1, v9, v3
+; GFX10-NEXT:    v_add_co_u32 v3, s23, v4, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v10, s23, v14, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
-; GFX10-NEXT:    v_mul_lo_u32 v15, v4, v2
-; GFX10-NEXT:    v_add_co_u32 v10, s23, v10, v11
-; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v14, v8, v0
+; GFX10-NEXT:    v_add_co_u32 v3, s23, v3, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v6, s23, v10, v15
+; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v0, s23, v0, v14
-; GFX10-NEXT:    v_mul_lo_u32 v14, v3, v2
+; GFX10-NEXT:    v_mul_hi_u32 v16, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v17, v5, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v4, v3
+; GFX10-NEXT:    v_add_co_u32 v4, s23, v11, v14
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v13, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v2, s23, v15, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s23
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
-; GFX10-NEXT:    v_add_co_u32 v8, s23, v8, v15
+; GFX10-NEXT:    v_add_co_u32 v0, s23, v6, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v4, s23, v4, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v16
+; GFX10-NEXT:    v_add3_u32 v1, v3, v6, v1
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v10, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
-; GFX10-NEXT:    v_mul_hi_u32 v16, v4, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v13, v11
-; GFX10-NEXT:    v_add_co_u32 v1, s23, v14, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v0, s23, v0, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v8, s23, v8, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v9, s23, v1, v16
-; GFX10-NEXT:    v_add3_u32 v7, v11, v10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v12, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s20, v6, 0
-; GFX10-NEXT:    v_add_co_u32 v7, s23, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
-; GFX10-NEXT:    v_mul_lo_u32 v9, s21, v6
-; GFX10-NEXT:    v_mul_lo_u32 v11, s20, v5
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v7
-; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
-; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v10, v6, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, v5, v0
-; GFX10-NEXT:    v_add3_u32 v7, v1, v11, v9
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s20, s5, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
-; GFX10-NEXT:    v_mul_lo_u32 v9, s22, v4
-; GFX10-NEXT:    v_mul_lo_u32 v11, s5, v3
-; GFX10-NEXT:    v_mul_lo_u32 v13, v5, v7
-; GFX10-NEXT:    v_mul_hi_u32 v14, v6, v7
-; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT:    v_mul_lo_u32 v15, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v16, v4, v1
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v12
-; GFX10-NEXT:    v_add3_u32 v2, v2, v11, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v13, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s21, v6, 0
+; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v11, v7, v0
+; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v17
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s23, s5, v8, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2]
+; GFX10-NEXT:    v_mov_b32_e32 v1, v3
+; GFX10-NEXT:    v_mul_lo_u32 v12, v9, v2
+; GFX10-NEXT:    v_mul_hi_u32 v13, v8, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5]
+; GFX10-NEXT:    v_mul_lo_u32 v4, v7, v0
+; GFX10-NEXT:    v_mul_hi_u32 v5, v6, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2]
+; GFX10-NEXT:    v_mul_hi_u32 v2, v9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v14, v6, v3
+; GFX10-NEXT:    v_mul_lo_u32 v15, v7, v3
+; GFX10-NEXT:    v_mul_hi_u32 v16, v6, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1]
+; GFX10-NEXT:    v_mul_hi_u32 v1, v7, v3
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v4, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v11, s5, v15, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v5
+; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v5, s5, v11, v16
+; GFX10-NEXT:    v_mul_lo_u32 v16, v9, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v10
-; GFX10-NEXT:    v_mul_lo_u32 v12, v4, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
-; GFX10-NEXT:    v_mul_hi_u32 v1, v3, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
-; GFX10-NEXT:    v_mul_lo_u32 v13, v3, v2
-; GFX10-NEXT:    v_mul_hi_u32 v14, v4, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v11, v10
-; GFX10-NEXT:    v_add_co_u32 v10, s5, v15, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v13, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v14, v11
+; GFX10-NEXT:    v_add_co_u32 v11, s5, v12, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v10, s5, v10, v16
-; GFX10-NEXT:    v_add3_u32 v7, v9, v8, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v14
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v10
-; GFX10-NEXT:    v_mul_hi_u32 v9, s0, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v12, v8
-; GFX10-NEXT:    v_mul_lo_u32 v8, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v10, s0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s1, v5
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v6
+; GFX10-NEXT:    v_add_co_u32 v2, s5, v16, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v11, s5, v11, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v17
+; GFX10-NEXT:    v_add3_u32 v1, v4, v5, v1
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v6, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
+; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v14, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_mul_lo_u32 v11, s0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX10-NEXT:    v_mul_lo_u32 v12, s1, v1
+; GFX10-NEXT:    v_add3_u32 v0, v5, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v1
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v6, v11
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v11, v0
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v12, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v12
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v7
+; GFX10-NEXT:    v_mul_lo_u32 v0, s15, v2
+; GFX10-NEXT:    v_mul_lo_u32 v12, s14, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v4
+; GFX10-NEXT:    v_mul_hi_u32 v9, s14, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, s15, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v6, v1
+; GFX10-NEXT:    v_add_co_u32 v6, s5, v0, v12
+; GFX10-NEXT:    v_mul_hi_u32 v13, s14, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v12, s5, v3, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s20, v7, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v12, 0
+; GFX10-NEXT:    v_add_co_u32 v6, s5, v6, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v9, s5, v2, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_add3_u32 v4, v4, v7, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v6
+; GFX10-NEXT:    v_mul_hi_u32 v5, s15, v8
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v12, 1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2]
+; GFX10-NEXT:    v_add_co_u32 v6, s5, v9, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v10, v8
-; GFX10-NEXT:    v_add3_u32 v2, v7, v6, v2
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v9
-; GFX10-NEXT:    v_add_co_u32 v7, s5, v0, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v8, s7, v7
-; GFX10-NEXT:    v_mul_lo_u32 v3, s15, v4
-; GFX10-NEXT:    v_add3_u32 v5, v6, v0, v5
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v7, 0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s14, v2
-; GFX10-NEXT:    v_mul_hi_u32 v10, s14, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v5
-; GFX10-NEXT:    v_mul_hi_u32 v4, s15, v4
-; GFX10-NEXT:    v_mul_lo_u32 v11, s15, v2
-; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v6
-; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, s0, v0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v7, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v7, 1
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2]
+; GFX10-NEXT:    v_add3_u32 v5, v3, v9, v5
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s5, s2, v6, 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v9, s1, v1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s0, 0, v5, s0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v12, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v4, s1, v11, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v6, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v12
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s0
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v14, s6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v14
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v10
-; GFX10-NEXT:    v_mul_hi_u32 v10, s14, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v19, v18, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v17, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v16, v13, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v16, s0, v0, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v8, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, v4, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
-; GFX10-NEXT:    v_mul_lo_u32 v13, s3, v3
-; GFX10-NEXT:    v_add3_u32 v2, v10, v4, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v0, v16, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v17, s0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s2, v3, 0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v2
-; GFX10-NEXT:    v_sub_co_u32 v10, s1, v14, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s1, 0, v9, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v14, v10, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v20, v17, s0
+; GFX10-NEXT:    v_sub_co_u32 v1, s0, v3, s6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v1, v1, v8, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v9, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, s0, s14, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s1, s15, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v1
-; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v7
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1]
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v9, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s14, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s1, s15, v0, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v8
+; GFX10-NEXT:    v_xor_b32_e32 v1, s18, v1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v4, s19, v4
-; GFX10-NEXT:    v_xor_b32_e32 v5, s4, v5
-; GFX10-NEXT:    v_mov_b32_e32 v16, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX10-NEXT:    v_xor_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v2, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v1, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v9
-; GFX10-NEXT:    v_xor_b32_e32 v4, s4, v6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v11, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v14, s0, v3, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
-; GFX10-NEXT:    v_add_co_u32 v11, s0, v14, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v15, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_sub_co_u32 v7, s0, v12, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v13, v10, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v9, v12, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v15, s0, v6, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v5, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_sub_co_u32 v9, s0, v13, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v14, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v5, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s0
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v5, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
-; GFX10-NEXT:    v_xor_b32_e32 v8, s1, v2
-; GFX10-NEXT:    v_xor_b32_e32 v6, s12, v6
-; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v7
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s4
+; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v6
+; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v11
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v2
+; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v8
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v8, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
-; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[8:9]
-; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[10:11]
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v8, vcc_lo
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[8:9]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[10:11]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 8b4e218f78948b..a58397eccaba76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -389,97 +389,94 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX10-NEXT:    s_sub_u32 s0, 0, s10
-; GFX10-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v3, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
-; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s0, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
+; GFX10-NEXT:    s_subb_u32 s1, 0, s11
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
+; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, v2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX10-NEXT:    v_add_co_u32 v4, s2, v4, v5
+; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s2, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s2
 ; GFX10-NEXT:    v_add_co_u32 v6, s2, v7, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s2
-; GFX10-NEXT:    v_add_co_u32 v0, s2, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s2, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT:    v_add_co_u32 v4, s2, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v2, s2, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s2, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s2
+; GFX10-NEXT:    v_add_co_u32 v0, s2, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v0
-; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s1, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v3, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s0, v2
-; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s0, v4, v[1:2]
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s1, v3, v[1:2]
+; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, v2, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, v2, v1
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v5
+; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v6, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
-; GFX10-NEXT:    v_add3_u32 v1, v5, v4, v1
-; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
+; GFX10-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s9, v1
-; GFX10-NEXT:    v_mul_hi_u32 v6, s8, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v4, v6
+; GFX10-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v5, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v3, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v4, s11, v2
-; GFX10-NEXT:    v_add3_u32 v3, v3, v0, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v2, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v5, 0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s8, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v1
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
@@ -493,8 +490,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v13, s0, v4, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v5, s0
+; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v8
@@ -502,14 +499,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s0
 ; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s0
 ; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
@@ -1523,8 +1520,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    s_subb_u32 s1, 0, s13
-; GFX10-NEXT:    s_sub_u32 s2, 0, s14
-; GFX10-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1533,243 +1528,239 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX10-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v5, 0xcf800000, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v3
-; GFX10-NEXT:    v_add_f32_e32 v0, v4, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
-; GFX10-NEXT:    v_mul_lo_u32 v10, s2, v6
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v4, v2
+; GFX10-NEXT:    v_trunc_f32_e32 v5, v3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v5
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, s0, v4
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s6, s0, v5, 0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s1, v5
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s6, s2, v8, 0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v8
-; GFX10-NEXT:    v_add3_u32 v1, v1, v7, v9
-; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v0
-; GFX10-NEXT:    v_mul_hi_u32 v9, v5, v0
-; GFX10-NEXT:    v_add3_u32 v3, v3, v10, v11
-; GFX10-NEXT:    v_mul_hi_u32 v0, v4, v0
-; GFX10-NEXT:    v_mul_lo_u32 v12, v5, v1
-; GFX10-NEXT:    v_mul_lo_u32 v13, v4, v1
-; GFX10-NEXT:    v_mul_lo_u32 v10, v6, v2
-; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v11, v8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, v6, v2
-; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v3
-; GFX10-NEXT:    v_mul_hi_u32 v14, v5, v1
-; GFX10-NEXT:    v_add_co_u32 v7, s6, v7, v12
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v7, 0
+; GFX10-NEXT:    s_sub_u32 s2, 0, s14
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s3, s2, v8, 0
+; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
+; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v0
+; GFX10-NEXT:    s_subb_u32 s3, 0, s15
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5]
+; GFX10-NEXT:    v_mul_hi_u32 v4, v7, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6]
+; GFX10-NEXT:    v_mul_lo_u32 v1, v10, v2
+; GFX10-NEXT:    v_mul_hi_u32 v5, v8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v10, v2
+; GFX10-NEXT:    v_mul_lo_u32 v12, v7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v13, v9, v3
+; GFX10-NEXT:    v_mul_hi_u32 v14, v7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v16, v10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v3, v9, v3
+; GFX10-NEXT:    v_mul_hi_u32 v0, v10, v0
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v13, v0
+; GFX10-NEXT:    v_add_co_u32 v11, s6, v13, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v10, s6, v10, v15
+; GFX10-NEXT:    v_add_co_u32 v1, s6, v1, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s6
 ; GFX10-NEXT:    v_add_co_u32 v2, s6, v16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v7, s6, v7, v9
-; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v10, s6, v10, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v12, v7
-; GFX10-NEXT:    v_add_co_u32 v2, s6, v2, v17
-; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v15, v10
+; GFX10-NEXT:    v_add_co_u32 v4, s6, v6, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v11, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
-; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v13, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
-; GFX10-NEXT:    v_mul_hi_u32 v3, v6, v3
-; GFX10-NEXT:    v_add_co_u32 v2, s6, v2, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v16, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
-; GFX10-NEXT:    v_add3_u32 v1, v9, v7, v1
-; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v0
-; GFX10-NEXT:    v_add3_u32 v3, v11, v10, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v3, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s6, s0, v5, 0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v5
-; GFX10-NEXT:    v_mul_lo_u32 v9, s0, v4
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s2, v8, 0
-; GFX10-NEXT:    v_mul_lo_u32 v10, s3, v8
-; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v6
-; GFX10-NEXT:    v_mul_lo_u32 v12, v4, v0
-; GFX10-NEXT:    v_mul_hi_u32 v13, v5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, v4, v0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v7
-; GFX10-NEXT:    v_mul_lo_u32 v7, v6, v2
-; GFX10-NEXT:    v_mul_hi_u32 v9, v8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, v6, v2
-; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v10
-; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v1
-; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v1
-; GFX10-NEXT:    v_mul_hi_u32 v14, v5, v1
+; GFX10-NEXT:    v_add_co_u32 v1, s6, v1, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v2, s6, v2, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v4, s6, v6, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v15, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v13, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v16, v5
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v4
+; GFX10-NEXT:    v_add_co_u32 v1, s6, v2, v1
+; GFX10-NEXT:    v_add3_u32 v3, v11, v6, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v2, v5, v2, v0
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s6, s0, v7, 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s6, s2, v8, 0
 ; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v3
-; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v3
-; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v12, v10
+; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
+; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s1, v7, v[4:5]
+; GFX10-NEXT:    v_mul_hi_u32 v4, v7, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s3, v8, v[5:6]
+; GFX10-NEXT:    v_mul_lo_u32 v1, v10, v2
+; GFX10-NEXT:    v_mul_hi_u32 v5, v8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v10, v2
+; GFX10-NEXT:    v_mul_lo_u32 v12, v7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v13, v9, v3
+; GFX10-NEXT:    v_mul_hi_u32 v14, v7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v16, v10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v3, v9, v3
+; GFX10-NEXT:    v_mul_hi_u32 v0, v10, v0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v11, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v15
+; GFX10-NEXT:    v_add_co_u32 v11, s0, v13, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v10, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v6, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v11, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v11, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v15, v7
-; GFX10-NEXT:    v_mul_hi_u32 v3, v6, v3
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v5, v0
-; GFX10-NEXT:    v_add3_u32 v1, v11, v9, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v16, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, v15, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v6, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v13, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v2, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v16, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_add3_u32 v3, v11, v6, v3
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v7, v4
+; GFX10-NEXT:    v_add3_u32 v0, v5, v2, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v8, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u32 v4, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v0
-; GFX10-NEXT:    v_add3_u32 v3, v10, v7, v3
-; GFX10-NEXT:    v_mul_lo_u32 v7, s8, v1
-; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v1
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v6, v3, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v6, s8, v1
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s9, v4
+; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v11, s9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, s11, v0
+; GFX10-NEXT:    v_mul_hi_u32 v13, s10, v0
+; GFX10-NEXT:    v_mul_hi_u32 v14, s11, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v3, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v9, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v4
-; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT:    v_mul_hi_u32 v8, s10, v2
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v0, v4
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v6, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v12, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX10-NEXT:    v_mul_lo_u32 v9, s11, v3
-; GFX10-NEXT:    v_mul_hi_u32 v10, s10, v3
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_add3_u32 v5, v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v3, v0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s12, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s13, v4
-; GFX10-NEXT:    v_mul_lo_u32 v12, s12, v5
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v9, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
+; GFX10-NEXT:    v_add_co_u32 v8, s0, v4, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v1, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s12, v8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add3_u32 v1, v1, v12, v11
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v7, v6
-; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v8
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v1
-; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, s8, v0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s0, s9, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s12, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v9, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v16, s0, v4, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v5, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v14, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v16, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v17, s0
-; GFX10-NEXT:    v_add3_u32 v3, v7, v1, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v0, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s14, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v18, s14, v3
-; GFX10-NEXT:    v_mul_lo_u32 v19, s15, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_sub_co_u32 v14, s0, v8, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v13
-; GFX10-NEXT:    v_add3_u32 v16, v1, v18, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, s1, s10, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v6, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v16
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v17, s2, s11, v16, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v15, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v14, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, s15, v4, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v13
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v9, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s1
-; GFX10-NEXT:    v_sub_co_u32 v15, s1, v13, s14
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s2, 0, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, s14, v10, 0
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v7
+; GFX10-NEXT:    v_add3_u32 v9, v5, v4, v11
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v8, 1
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_add3_u32 v7, v7, v6, v14
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v12, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s8, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v14
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s9, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s0
+; GFX10-NEXT:    v_sub_co_u32 v15, s0, s10, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v2, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v12, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, vcc_lo, s15, v8, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s11, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s12, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s13, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v21, v20, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v13, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v17, v8, s1
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v17, v0, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v10, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v13, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s14
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v14, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v9, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v13, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v8, s1
+; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 8874240fae8dc7..526ee5a51745d3 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -4418,30 +4418,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v6, v4, v3
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v7, v5, v2
-; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v6, v7
-; GFX10-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v4
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v5, vcc_lo
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v6, v2, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v3, v6, v3
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v7, v2
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX10-GISEL-NEXT:    v_add3_u32 v5, v5, v3, v2
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX10-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, 1
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v5, v0
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX10-GISEL-NEXT:    v_add3_u32 v3, v3, v1, v0
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v4
-; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX10-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 1
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, 0
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v6
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v8, v2, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v4, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v8, 0
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v7, vcc_lo, v5, 1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, v4
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v7, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v6, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -4479,38 +4477,36 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v7, vcc_lo, v0, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v7, v2, 0
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5]
+; GFX11-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v7
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v2, 0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v11, vcc_lo, v0, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v5, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, v7
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v6, v11, 0
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-GISEL-NEXT:    v_add_co_u32 v9, vcc_lo, v6, 1
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v6, v4, v3
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v7, v5, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_add3_u32 v1, v1, v6, v7
-; GFX11-GISEL-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v5, vcc_lo
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v6, v2, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v3, v6, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v2, v7, v2
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_add3_u32 v5, v5, v3, v2
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v0, v5, v0
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX11-GISEL-NEXT:    v_add3_u32 v3, v3, v1, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v4
-; GFX11-GISEL-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v9, 0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add i64 %x, 1
@@ -5116,54 +5112,51 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v4, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v13, v9, v4
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v10, v6, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v14, v10, v7
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v15, v11, v6
-; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v12, v13
-; GFX10-GISEL-NEXT:    v_add3_u32 v12, v3, v14, v15
-; GFX10-GISEL-NEXT:    v_add_co_u32 v3, vcc_lo, v0, v8
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v1, v9, vcc_lo
-; GFX10-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v10
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v12, v11, vcc_lo
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v3, v4, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v13, v13, v4
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v10, v6, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v7, v10, v7
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v6, v11, v6
-; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 1
-; GFX10-GISEL-NEXT:    v_add3_u32 v9, v9, v5, v13
-; GFX10-GISEL-NEXT:    v_add3_u32 v10, v4, v7, v6
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v12, vcc_lo
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v8, v0, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v8, v1
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v9, v0
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v3, v2, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v11, v3, v11
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v10, v2
-; GFX10-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX10-GISEL-NEXT:    v_add_co_u32 v12, vcc_lo, v3, 1
-; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v10, vcc_lo
-; GFX10-GISEL-NEXT:    v_add3_u32 v3, v5, v1, v0
-; GFX10-GISEL-NEXT:    v_add3_u32 v5, v7, v11, v2
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v8, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v9
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v7, v3, v8
-; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v6, v12, 0
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v10
-; GFX10-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v12
-; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v4, v7
-; GFX10-GISEL-NEXT:    v_add3_u32 v3, v3, v6, v5
+; GFX10-GISEL-NEXT:    v_add_co_u32 v12, vcc_lo, v0, 1
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v14, vcc_lo, v2, 1
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v12, v4, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, 0, v3, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v14, v6, 0
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v3, vcc_lo, v0, v12
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v3, v4, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v10, v13, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v17, vcc_lo, v2, v14
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[13:14], null, v17, v6, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v8, v15, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_co_u32 v19, vcc_lo, v0, 1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v14
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v20, vcc_lo, 0, v10, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v11, v19, 0
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v15, vcc_lo, v2, 1
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v8, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v13, v15, 0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v10
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v14, vcc_lo, v11, 1
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, v8
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, 0, v4, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7]
+; GFX10-GISEL-NEXT:    v_add_co_u32 v17, vcc_lo, v13, 1
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v9, v14, 0
+; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, 0, v5, vcc_lo
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v7, v17, 0
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6]
+; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i64:
@@ -5224,60 +5217,62 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v4, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v13, v9, v4
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v10, v6, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v14, v10, v7
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v15, v11, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add3_u32 v1, v1, v12, v13
-; GFX11-GISEL-NEXT:    v_add3_u32 v12, v3, v14, v15
-; GFX11-GISEL-NEXT:    v_add_co_u32 v3, vcc_lo, v0, v8
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v1, v9, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v10
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v12, v11, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v13, vcc_lo, v0, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v15, vcc_lo, v2, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v13, v4, 0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, 0, v3, vcc_lo
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v15, v6, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4]
+; GFX11-GISEL-NEXT:    v_add_co_u32 v3, vcc_lo, v0, v13
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10]
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v10, v14, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v18, vcc_lo, v2, v15
 ; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v3, v4, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v5, v3, v5
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v13, v13, v4
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v10, v6, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v7, v10, v7
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v6, v11, v6
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 1
-; GFX11-GISEL-NEXT:    v_add3_u32 v9, v9, v5, v13
-; GFX11-GISEL-NEXT:    v_add3_u32 v10, v4, v7, v6
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, 0, v12, vcc_lo
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v8, v0, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v1, v8, v1
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v0, v9, v0
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v3, v2, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v11, v3, v11
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v2, v10, v2
-; GFX11-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_co_u32 v12, vcc_lo, v3, 1
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, 0, v10, vcc_lo
-; GFX11-GISEL-NEXT:    v_add3_u32 v3, v5, v1, v0
-; GFX11-GISEL-NEXT:    v_add3_u32 v5, v7, v11, v2
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v8, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v4, v4, v9
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v7, v3, v8
-; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v6, v12, 0
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v10
-; GFX11-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v12
-; GFX11-GISEL-NEXT:    v_add3_u32 v1, v1, v4, v7
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v11, v16, vcc_lo
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[12:13], null, v18, v6, 0
+; GFX11-GISEL-NEXT:    v_add_co_u32 v20, vcc_lo, v0, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v21, vcc_lo, 0, v10, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, v9
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[9:10], null, v8, v20, 0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2]
+; GFX11-GISEL-NEXT:    v_add_co_u32 v18, vcc_lo, v2, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v11, vcc_lo
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[4:5], null, v12, v18, 0
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1]
+; GFX11-GISEL-NEXT:    v_add_co_u32 v14, vcc_lo, v8, 1
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_co_u32 v17, vcc_lo, v12, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v9, v14, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v17, 0
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, 0, v10, vcc_lo
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8]
+; GFX11-GISEL-NEXT:    v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_add3_u32 v3, v3, v6, v5
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i64> %x, <i64 1, i64 1>
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 6ca6cfb18ccdb8..e86057422cd78e 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -295,9 +295,8 @@ static Expected<LLTCodeGen> getInstResultType(const TreePatternNode &Dst,
   // below, we only expect one explicit def here.
   assert(Dst.getOperator()->isSubClassOf("Instruction"));
   CodeGenInstruction &InstInfo = Target.getInstruction(Dst.getOperator());
-  if (InstInfo.Operands.NumDefs != 1)
-    return failedImport(
-        "Dst pattern child only supported with exactly one result");
+  if (!InstInfo.Operands.NumDefs)
+    return failedImport("Dst pattern child needs a def");
 
   ArrayRef<TypeSetByHwMode> ChildTypes = Dst.getExtTypes();
   if (ChildTypes.size() < 1)
@@ -408,9 +407,11 @@ class GlobalISelEmitter final : public GlobalISelMatchTableExecutorEmitter {
   createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M,
                             const TreePatternNode &Dst);
 
-  Expected<action_iterator> importExplicitDefRenderers(
-      action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
-      const TreePatternNode &Src, const TreePatternNode &Dst);
+  Expected<action_iterator>
+  importExplicitDefRenderers(action_iterator InsertPt, RuleMatcher &M,
+                             BuildMIAction &DstMIBuilder,
+                             const TreePatternNode &Src,
+                             const TreePatternNode &Dst, unsigned Start = 0);
 
   Expected<action_iterator> importExplicitUseRenderers(
       action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
@@ -1379,6 +1380,14 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
   // Assign the result to TempReg.
   DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, true);
 
+  // Handle additional (ignored) results.
+  if (DstMIBuilder.getCGI()->Operands.NumDefs > 1) {
+    InsertPtOrError = importExplicitDefRenderers(
+        std::prev(*InsertPtOrError), M, DstMIBuilder, Src, Dst, /*Start=*/1);
+    if (auto Error = InsertPtOrError.takeError())
+      return std::move(Error);
+  }
+
   InsertPtOrError = importExplicitUseRenderers(InsertPtOrError.get(), M,
                                                DstMIBuilder, Dst, Src);
   if (auto Error = InsertPtOrError.takeError())
@@ -1507,14 +1516,14 @@ Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer(
 
 Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
     action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
-    const TreePatternNode &Src, const TreePatternNode &Dst) {
+    const TreePatternNode &Src, const TreePatternNode &Dst, unsigned Start) {
   const CodeGenInstruction *DstI = DstMIBuilder.getCGI();
   const unsigned SrcNumDefs = Src.getExtTypes().size();
   const unsigned DstNumDefs = DstI->Operands.NumDefs;
   if (DstNumDefs == 0)
     return InsertPt;
 
-  for (unsigned I = 0; I < SrcNumDefs; ++I) {
+  for (unsigned I = Start; I < SrcNumDefs; ++I) {
     std::string OpName = getMangledRootDefName(DstI->Operands[I].Name);
     // CopyRenderer saves a StringRef, so cannot pass OpName itself -
     // let's use a string with an appropriate lifetime.
@@ -1800,8 +1809,6 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) {
   // Don't want to try and infer things when there could potentially be more
   // than one candidate register class.
   auto &Inst = Target.getInstruction(OpRec);
-  if (Inst.Operands.NumDefs > 1)
-    return std::nullopt;
 
   // Handle any special-case instructions which we can safely infer register
   // classes from.

>From cb3c7ed6bf9bd4939f50021b04e2f4db37a7ea2b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 8 Mar 2024 09:37:50 +0100
Subject: [PATCH 2/2] fix test

---
 ...lobalISelEmitter-multiple-output-reject.td | 14 ---------
 .../GlobalISelEmitter-multiple-output.td      | 30 +++++++++++++++++++
 2 files changed, 30 insertions(+), 14 deletions(-)
 delete mode 100644 llvm/test/TableGen/GlobalISelEmitter-multiple-output-reject.td

diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-reject.td b/llvm/test/TableGen/GlobalISelEmitter-multiple-output-reject.td
deleted file mode 100644
index bce2d8cb096bbc..00000000000000
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-reject.td
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s 2>&1 | FileCheck %s
-
-include "llvm/Target/Target.td"
-include "GlobalISelEmitterCommon.td"
-
-// Test when the inner instruction in the output pattern has two outs
-
-def TwoOutsInstr : I<(outs GPR32:$out1, GPR32:$out2), (ins GPR32:$src), []>;
-def OtherInstr : I<(outs GPR32:$dst), (ins GPR32:$src), []>;
-
-def : Pat<(i32 (add i32:$src, i32:$src)),
-  (OtherInstr (TwoOutsInstr GPR32:$src))>;
-
-// CHECK: warning: Skipped pattern: Dst pattern child only supported with exactly one result
diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td b/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td
index e063ebd4c2bbdd..94c9f60eabd3bc 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td
+++ b/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td
@@ -117,3 +117,33 @@ def : Pat<(i32 (add i32:$src, i32:$src)),
 // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
 // CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
 // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+
+//-----------------------------------------------------------------------------
+// Test when the inner instruction in the output pattern has two outs
+
+// CHECK:      GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ADD),
+// CHECK-NEXT: // MIs[0] DstI[dst]
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT: // MIs[0] src
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[0] src
+// CHECK-NEXT: GIM_CheckIsSameOperand, /*MI*/0, /*OpIdx*/2, /*OtherMI*/0, /*OtherOpIdx*/1,
+// CHECK-NEXT: // (add:{ *:[i32] } i32:{ *:[i32] }:$src, i32:{ *:[i32] }:$src)  =>  (OtherInstr:{ *:[i32] } (TwoOutsInstr:{ *:[i32] }:{ *:[i32] } GPR32:{ *:[i32] }:$src))
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(MyTarget::TwoOutsInstr),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/1, /*TempRegFlags*/GIMT_Encode2(RegState::Define|RegState::Dead),
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/1, // src
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/1,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::OtherInstr),
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // DstI[dst]
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+
+def TwoOutsInstr : I<(outs GPR32:$out1, GPR32:$out2), (ins GPR32:$src), []>;
+
+def : Pat<(i32 (add i32:$src, i32:$src)),
+  (OtherInstr (TwoOutsInstr GPR32:$src))>;



More information about the llvm-commits mailing list