[llvm] [GlobalISel] Combine [S,U]SUBO (PR #116489)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 16 22:34:33 PST 2024


https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/116489

>From 86a0535e87599a6086a3f78849d5ed2caadccfbf Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Sat, 16 Nov 2024 16:55:15 +0100
Subject: [PATCH 1/3] [GlobalISel] Combine [S,U]SUBO

We import the llvm.ssub.with.overflow.* intrinsics, but the Legalizer
also builds these opcodes while legalizing other operations; see
narrowScalarAddSub.
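
The combine queries the known bits of both operands; when the carry-out
is provably constant, the G_[S,U]SUBO is replaced by a plain G_SUB plus
a constant carry. The following standalone sketch is not part of the
patch; it only mirrors the decision logic of matchSuboCarryOut for the
unsigned case, and assumes an LLVM tree to compile and link against:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/ConstantRange.h"
  #include "llvm/Support/KnownBits.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // LHS: upper 24 bits known zero -> unsigned range [0, 255].
    KnownBits LHSKnown(32);
    LHSKnown.Zero.setBitsFrom(8);
    // RHS: the constant 512.
    KnownBits RHSKnown = KnownBits::makeConstant(APInt(32, 512));

    ConstantRange L = ConstantRange::fromKnownBits(LHSKnown, /*IsSigned=*/false);
    ConstantRange R = ConstantRange::fromKnownBits(RHSKnown, /*IsSigned=*/false);

    switch (L.unsignedSubMayOverflow(R)) {
    case ConstantRange::OverflowResult::NeverOverflows:
      outs() << "fold to G_SUB (nuw) + carry 0\n";
      break;
    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
    case ConstantRange::OverflowResult::AlwaysOverflowsHigh:
      // This case fires here: [0, 255] - 512 always wraps below zero.
      outs() << "fold to G_SUB + carry 1\n";
      break;
    case ConstantRange::OverflowResult::MayOverflow:
      outs() << "leave G_USUBO alone\n";
      break;
    }
    return 0;
  }

The signed path is analogous: it uses signedSubMayOverflow and tags the
no-overflow G_SUB with NoSWrap instead of NoUWrap. When the ranges only
say MayOverflow (as in the sub_may/usub_may MIR tests below), the
instruction is left untouched.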
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   3 +
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |  17 +
 .../include/llvm/Target/GlobalISel/Combine.td |  17 +-
 .../GlobalISel/CombinerHelperArtifacts.cpp    |  72 ++
 llvm/lib/Target/AArch64/AArch64Combine.td     |   4 +-
 .../AArch64/GlobalISel/combine-overflow.mir   |  46 +
 llvm/test/CodeGen/AArch64/popcount.ll         |   4 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     | 358 ++++----
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     | 788 +++++++++---------
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     | 291 ++++---
 10 files changed, 874 insertions(+), 726 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b1232a368a3657..55c3b72c8e027f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -939,6 +939,9 @@ class CombinerHelper {
   // merge_values(_, zero) -> zext
   bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // overflow sub
+  bool matchSuboCarryOut(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 4de14dee190fb3..9e5d4d34f24d2b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -486,6 +486,23 @@ class GAddCarryOut : public GBinOpCarryOut {
   }
 };
 
+/// Represents overflowing sub operations.
+/// G_USUBO, G_SSUBO
+class GSubCarryOut : public GBinOpCarryOut {
+public:
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SSUBO; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_USUBO:
+    case TargetOpcode::G_SSUBO:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents overflowing add/sub operations that also consume a carry-in.
 /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
 class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index f8379609bf1d98..b4f1551e965b14 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,6 +1385,12 @@ def match_addos : GICombineRule<
         [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_subos : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
+        [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 def match_extract_of_element_undef_vector: GICombineRule <
   (defs root:$root),
   (match (G_IMPLICIT_DEF $vector),
@@ -1901,6 +1907,13 @@ def cmp_combines: GICombineGroup<[
   redundant_binop_in_equality
 ]>;
 
+
+def artifact_combines: GICombineGroup<[
+  merge_combines,
+  match_addos,
+  match_subos
+]>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1984,9 +1997,9 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
-    combine_concat_vector, match_addos,
+    combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
-    combine_use_vector_truncate, merge_combines]>;
+    combine_use_vector_truncate, artifact_combines]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 797a1e84e21e35..85c56ee6863a95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -84,3 +84,75 @@ bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
   };
   return true;
 }
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+                                       BuildFnTy &MatchInfo) {
+  const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+  Register Dst = Subo->getReg(0);
+  Register LHS = Subo->getLHSReg();
+  Register RHS = Subo->getRHSReg();
+  Register Carry = Subo->getCarryOutReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // Check legality before known bits.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  if (Subo->isSigned()) {
+    // G_SSUBO
+    ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+                                                       /* IsSigned= */ true);
+    ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+                                                       /* IsSigned= */ true);
+    switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS);
+        B.buildConstant(Carry, 1);
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // G_USUBO
+  ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+                                                     /* IsSigned= */ false);
+  ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+                                                     /* IsSigned= */ false);
+  switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+  case ConstantRange::OverflowResult::MayOverflow:
+    return false;
+  case ConstantRange::OverflowResult::NeverOverflows: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS);
+      B.buildConstant(Carry, 1);
+    };
+    return true;
+  }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 8af8cdfeba6ac4..23563bf9b7881f 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -320,7 +320,7 @@ def AArch64PostLegalizerCombiner
                         hoist_logic_op_with_same_opcode_hands,
                         redundant_and, xor_of_and_with_same_reg,
                         extractvecelt_pairwise_add, redundant_or,
-                        mul_const, redundant_sext_inreg,
+                        mul_const, redundant_sext_inreg, artifact_combines,
                         form_bitfield_extract, rotate_out_of_range,
                         icmp_to_true_false_known_bits,
                         select_combines, fold_merge_to_zext,
@@ -328,7 +328,7 @@ def AArch64PostLegalizerCombiner
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
-                        commute_constant_to_rhs, merge_combines,
+                        commute_constant_to_rhs,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index bc4b5ae7c066a6..87b30f558539c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -176,3 +176,49 @@ body:             |
     $q1 = COPY %o_wide
     RET_ReallyLR implicit $w0
 ...
+---
+name:            sub_may
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sub_may
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_SSUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s1) = G_SSUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            usub_may
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s1) = G_USUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s1) = G_USUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 1fc4de1c48b7dd..f9f1cd4b1fcf76 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -113,9 +113,9 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ;
 ; GISEL-LABEL: popcount256:
 ; GISEL:       // %bb.0: // %Entry
-; GISEL-NEXT:    ldp x8, x9, [x0, #16]
+; GISEL-NEXT:    ldp x8, x9, [x0]
 ; GISEL-NEXT:    mov v0.d[0], x8
-; GISEL-NEXT:    ldp x8, x10, [x0]
+; GISEL-NEXT:    ldp x8, x10, [x0, #16]
 ; GISEL-NEXT:    mov v1.d[0], x8
 ; GISEL-NEXT:    mov v0.d[1], x9
 ; GISEL-NEXT:    mov v1.d[1], x10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 613c73f7b9368b..14b30e0d79946c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1178,212 +1178,212 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v17, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], v18, v13
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, -1, v14, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v17, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v16, v13, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -1392,23 +1392,23 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v8, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v6
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d5e22df59ccb37..ee7a040e41fd5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1106,210 +1106,210 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xfffff000
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x1000
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v4
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v16, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v15, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v14, v16, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1321,18 +1321,18 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1699,210 +1699,210 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s6, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v5, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v7, v6, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v0, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v4, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v9, v8, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v14
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v13
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v14
+; GISEL-NEXT:    v_xor_b32_e32 v18, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v1, v16, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v13, v17, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v0
+; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT:    v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v4
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v16, v18, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT:    s_mov_b32 s6, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v0
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v16, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v7, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v14, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8]
-; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v8, v2, v9
-; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v3, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v1
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v15, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v11, v8
+; GISEL-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v14, v16, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v5
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v6
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_xor_b32_e32 v10, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v8
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1914,18 +1914,18 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v11
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -3194,59 +3194,59 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v17, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v14, v4
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v4
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 0, v12
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v17, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v14, v0
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v10, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v17, v0
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v5
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v14, v4
-; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v17, v0, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
+; GISEL-NEXT:    v_mul_hi_u32 v1, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
-; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v4, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2]
+; GISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 0, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
 ; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
@@ -3274,7 +3274,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0, v3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a7e5ce3d216199..faad7e93da5d37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1095,192 +1095,189 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT:    s_sub_u32 s4, 0, 0x12d8fb
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT:    s_mov_b32 s4, 1
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT:    s_mov_b32 s5, 1
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_subb_u32 s4, 0, 0
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, 0
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s5, v5
-; GISEL-NEXT:    v_mul_hi_u32 v11, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v17, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v8
-; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, s5, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v19, v16
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v17, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v13
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v5, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, s4, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, s4, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v15, s4, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v18, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v12
-; GISEL-NEXT:    v_mul_lo_u32 v21, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v22, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v11
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v7, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v13, s4, v11
+; GISEL-NEXT:    v_mul_hi_u32 v14, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
+; GISEL-NEXT:    v_mul_hi_u32 v18, v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v20, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v9
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v21, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v22, v6, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v21, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v20
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v22
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v17
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v16
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v1, v11
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v5
+; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v5
+; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v5
+; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v16, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v17, v8
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v9, v9, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v12
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v8
 ; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4

>From 8a72eaf30e1e6225e87ab00b12cd832ffa8fa555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sat, 16 Nov 2024 19:20:34 +0100
Subject: [PATCH 2/3] address review comments

---
 .../include/llvm/Target/GlobalISel/Combine.td |  4 +-
 .../GlobalISel/CombinerHelperArtifacts.cpp    | 23 ++++----
 .../AArch64/GlobalISel/combine-overflow.mir   | 55 +++++++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index b4f1551e965b14..8c2f23b2faf2e1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1385,7 +1385,7 @@ def match_addos : GICombineRule<
         [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
-def match_subos : GICombineRule<
+def match_subo_no_overflow : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_SSUBO, G_USUBO):$root,
         [{ return Helper.matchSuboCarryOut(*${root}, ${matchinfo}); }]),
@@ -1911,7 +1911,7 @@ def cmp_combines: GICombineGroup<[
 def artifact_combines: GICombineGroup<[
   merge_combines,
   match_addos,
-  match_subos
+  match_subo_no_overflow
 ]>;
 
 // FIXME: These should use the custom predicate feature once it lands.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 85c56ee6863a95..2caf4bbfa62cdd 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -101,12 +101,15 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
       !isConstantLegalOrBeforeLegalizer(CarryTy))
     return false;
 
+  ConstantRange KBLHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+                                   /* IsSigned=*/Subo->isSigned());
+  ConstantRange KBRHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+                                   /* IsSigned=*/Subo->isSigned());
+
   if (Subo->isSigned()) {
     // G_SSUBO
-    ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
-                                                       /* IsSigned= */ true);
-    ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
-                                                       /* IsSigned= */ true);
     switch (KBLHS.signedSubMayOverflow(KBRHS)) {
     case ConstantRange::OverflowResult::MayOverflow:
       return false;
@@ -121,7 +124,9 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
     case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
       MatchInfo = [=](MachineIRBuilder &B) {
         B.buildSub(Dst, LHS, RHS);
-        B.buildConstant(Carry, 1);
+        B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                              /*isVector=*/CarryTy.isVector(),
+                                              /*isFP=*/false));
       };
       return true;
     }
@@ -130,10 +135,6 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
   }
 
   // G_USUBO
-  ConstantRange KBLHS = ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
-                                                     /* IsSigned= */ false);
-  ConstantRange KBRHS = ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
-                                                     /* IsSigned= */ false);
   switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
   case ConstantRange::OverflowResult::MayOverflow:
     return false;
@@ -148,7 +149,9 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
   case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
     MatchInfo = [=](MachineIRBuilder &B) {
       B.buildSub(Dst, LHS, RHS);
-      B.buildConstant(Carry, 1);
+      B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                            /*isVector=*/CarryTy.isVector(),
+                                            /*isFP=*/false));
     };
     return true;
   }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index 87b30f558539c8..20cba54923548e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -222,3 +222,58 @@ body:             |
     $w1 = COPY %o_wide
     RET_ReallyLR implicit $w0
 ...
+---
+name:            usub_may_carry_s11
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may_carry_s11
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %sub:_(s32), %o:_(s11) = G_USUBO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s11)
+    ; CHECK-NEXT: $w0 = COPY %sub(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %sub:_(s32), %o:_(s11) = G_USUBO %0, %const
+    %o_wide:_(s32) = G_ZEXT %o(s11)
+    $w0 = COPY %sub(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            usub_may_carry_s11_vector
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: usub_may_carry_s11_vector
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 512
+    ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK-NEXT: %bv1:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), %const(s32)
+    ; CHECK-NEXT: %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1
+    ; CHECK-NEXT: %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>)
+    ; CHECK-NEXT: $q0 = COPY %sub(<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY %o_wide(<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w0
+    %2:_(s32) = COPY $w0
+    %3:_(s32) = COPY $w0
+    %const:_(s32) = G_CONSTANT i32 512
+    %bv:_(<4 x s32>) = G_BUILD_VECTOR %const(s32), %0(s32), %1(s32), %2(s32)
+    %bv1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %const(s32)
+    %sub:_(<4 x s32>), %o:_(<4 x s11>) = G_USUBO %bv, %bv1
+    %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s11>)
+    $q0 = COPY %sub(<4 x s32>)
+    $q1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
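
The usub_may_carry_s11 tests above are negative tests: the LHS is a bare
COPY from $w0 with no known bits, so subtracting 512 may or may not borrow,
matchSuboCarryOut returns false, and the G_USUBO survives unchanged in the
CHECK lines. Below is a minimal standalone sketch of that classification
using the same ConstantRange/KnownBits queries the combine uses; it assumes
an LLVM build to compile against, and the concrete operand facts are
illustrative, not taken from the patch.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static const char *classify(ConstantRange::OverflowResult OR) {
  switch (OR) {
  case ConstantRange::OverflowResult::NeverOverflows:
    return "never borrows -> fold carry-out to 0";
  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
  case ConstantRange::OverflowResult::AlwaysOverflowsHigh:
    return "always borrows -> fold carry-out to true";
  case ConstantRange::OverflowResult::MayOverflow:
    return "may borrow -> keep the G_USUBO";
  }
  return "unknown";
}

int main() {
  // RHS is the constant 512 used by the MIR tests.
  ConstantRange RHS = ConstantRange::fromKnownBits(
      KnownBits::makeConstant(APInt(32, 512)), /*IsSigned=*/false);

  // Case 1: nothing is known about the LHS (a bare COPY from $w0).
  KnownBits UnknownLHS(32);
  ConstantRange L1 = ConstantRange::fromKnownBits(UnknownLHS, /*IsSigned=*/false);
  outs() << classify(L1.unsignedSubMayOverflow(RHS)) << "\n"; // may borrow

  // Case 2: bit 15 of the LHS is known set, so every value is >= 32768 > 512
  // and the subtraction can never borrow.
  KnownBits BigLHS(32);
  BigLHS.One.setBit(15);
  ConstantRange L2 = ConstantRange::fromKnownBits(BigLHS, /*IsSigned=*/false);
  outs() << classify(L2.unsignedSubMayOverflow(RHS)) << "\n"; // never borrows
  return 0;
}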

>From 7e4209201f8f72ce2b9a7be0fc8e7a3532ef199f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sun, 17 Nov 2024 07:34:01 +0100
Subject: [PATCH 3/3] moving and renaming

---
 .../include/llvm/Target/GlobalISel/Combine.td |  4 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 75 +++++++++++++++++++
 .../GlobalISel/CombinerHelperArtifacts.cpp    | 75 -------------------
 llvm/lib/Target/AArch64/AArch64Combine.td     |  2 +-
 4 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8c2f23b2faf2e1..230f7648fabc10 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1908,7 +1908,7 @@ def cmp_combines: GICombineGroup<[
 ]>;
 
 
-def artifact_combines: GICombineGroup<[
+def post_legalizer_combines: GICombineGroup<[
   merge_combines,
   match_addos,
   match_subo_no_overflow
@@ -1999,7 +1999,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
-    combine_use_vector_truncate, artifact_combines]>;
+    combine_use_vector_truncate, post_legalizer_combines]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 83d78c0bde399e..d95fc8cfbcf558 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7790,3 +7790,78 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,
 
   return true;
 }
+
+bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
+                                       BuildFnTy &MatchInfo) {
+  const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
+
+  Register Dst = Subo->getReg(0);
+  Register LHS = Subo->getLHSReg();
+  Register RHS = Subo->getRHSReg();
+  Register Carry = Subo->getCarryOutReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // Check legality before known bits.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  ConstantRange KBLHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
+                                   /* IsSigned=*/Subo->isSigned());
+  ConstantRange KBRHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
+                                   /* IsSigned=*/Subo->isSigned());
+
+  if (Subo->isSigned()) {
+    // G_SSUBO
+    switch (KBLHS.signedSubMayOverflow(KBRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSub(Dst, LHS, RHS);
+        B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                              /*isVector=*/CarryTy.isVector(),
+                                              /*isFP=*/false));
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // G_USUBO
+  switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
+  case ConstantRange::OverflowResult::MayOverflow:
+    return false;
+  case ConstantRange::OverflowResult::NeverOverflows: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildSub(Dst, LHS, RHS);
+      B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
+                                            /*isVector=*/CarryTy.isVector(),
+                                            /*isFP=*/false));
+    };
+    return true;
+  }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 2caf4bbfa62cdd..797a1e84e21e35 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -84,78 +84,3 @@ bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
   };
   return true;
 }
-
-bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI,
-                                       BuildFnTy &MatchInfo) {
-  const GSubCarryOut *Subo = cast<GSubCarryOut>(&MI);
-
-  Register Dst = Subo->getReg(0);
-  Register LHS = Subo->getLHSReg();
-  Register RHS = Subo->getRHSReg();
-  Register Carry = Subo->getCarryOutReg();
-  LLT DstTy = MRI.getType(Dst);
-  LLT CarryTy = MRI.getType(Carry);
-
-  // Check legality before known bits.
-  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy}}) ||
-      !isConstantLegalOrBeforeLegalizer(CarryTy))
-    return false;
-
-  ConstantRange KBLHS =
-      ConstantRange::fromKnownBits(KB->getKnownBits(LHS),
-                                   /* IsSigned=*/Subo->isSigned());
-  ConstantRange KBRHS =
-      ConstantRange::fromKnownBits(KB->getKnownBits(RHS),
-                                   /* IsSigned=*/Subo->isSigned());
-
-  if (Subo->isSigned()) {
-    // G_SSUBO
-    switch (KBLHS.signedSubMayOverflow(KBRHS)) {
-    case ConstantRange::OverflowResult::MayOverflow:
-      return false;
-    case ConstantRange::OverflowResult::NeverOverflows: {
-      MatchInfo = [=](MachineIRBuilder &B) {
-        B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
-        B.buildConstant(Carry, 0);
-      };
-      return true;
-    }
-    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
-    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
-      MatchInfo = [=](MachineIRBuilder &B) {
-        B.buildSub(Dst, LHS, RHS);
-        B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
-                                              /*isVector=*/CarryTy.isVector(),
-                                              /*isFP=*/false));
-      };
-      return true;
-    }
-    }
-    return false;
-  }
-
-  // G_USUBO
-  switch (KBLHS.unsignedSubMayOverflow(KBRHS)) {
-  case ConstantRange::OverflowResult::MayOverflow:
-    return false;
-  case ConstantRange::OverflowResult::NeverOverflows: {
-    MatchInfo = [=](MachineIRBuilder &B) {
-      B.buildSub(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
-      B.buildConstant(Carry, 0);
-    };
-    return true;
-  }
-  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
-  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
-    MatchInfo = [=](MachineIRBuilder &B) {
-      B.buildSub(Dst, LHS, RHS);
-      B.buildConstant(Carry, getICmpTrueVal(getTargetLowering(),
-                                            /*isVector=*/CarryTy.isVector(),
-                                            /*isFP=*/false));
-    };
-    return true;
-  }
-  }
-
-  return false;
-}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 23563bf9b7881f..f4a3d7e6371951 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -320,7 +320,7 @@ def AArch64PostLegalizerCombiner
                         hoist_logic_op_with_same_opcode_hands,
                         redundant_and, xor_of_and_with_same_reg,
                         extractvecelt_pairwise_add, redundant_or,
-                        mul_const, redundant_sext_inreg, artifact_combines,
+                        mul_const, redundant_sext_inreg, post_legalizer_combines,
                         form_bitfield_extract, rotate_out_of_range,
                         icmp_to_true_false_known_bits,
                         select_combines, fold_merge_to_zext,
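
The signed G_SSUBO path in the code above follows the same pattern through
signedSubMayOverflow. Here is a minimal sketch of one NeverOverflows case,
again assuming an LLVM build and with made-up operand facts purely for
illustration.

#include "llvm/IR/ConstantRange.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Both operands have a known-zero sign bit, so their signed ranges are
  // [0, 2^31 - 1] and the difference always fits in a signed 32-bit value.
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setSignBit();
  RHS.Zero.setSignBit();

  ConstantRange L = ConstantRange::fromKnownBits(LHS, /*IsSigned=*/true);
  ConstantRange R = ConstantRange::fromKnownBits(RHS, /*IsSigned=*/true);

  if (L.signedSubMayOverflow(R) ==
      ConstantRange::OverflowResult::NeverOverflows)
    outs() << "fold to G_SUB (nsw) plus a constant-false carry\n";
  else
    outs() << "leave the G_SSUBO in place\n";
  return 0;
}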


