[llvm] [ValueTracking] Improve `isImpliedCondICmps` to handle binops (PR #69840)

Yingwei Zheng via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 4 03:06:27 PDT 2023


https://github.com/dtcxzyw updated https://github.com/llvm/llvm-project/pull/69840

From dd6ee62e1adf2c4400b27a12755c04a326f0268b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 21 Oct 2023 20:43:39 +0800
Subject: [PATCH 1/5] [ValueTracking] Add pre-commit tests from PR68799. NFC.

---
 .../ValueTracking/implied-icmp-binop.ll       | 223 ++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll

diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
new file mode 100644
index 000000000000000..a85214346c5a08a
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Tests from PR68799
+
+define i1 @f_and(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_and(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[AND14:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[AND1115:%.*]] = and i1 [[CMP]], [[AND14]]
+; CHECK-NEXT:    ret i1 [[AND1115]]
+;
+entry:
+  %cmp = icmp ne i32 %x, 0
+  %0 = or i32 %x, %y
+  %and14 = icmp eq i32 %0, 0
+  %and1115 = and i1 %cmp, %and14
+  ret i1 %and1115
+}
+
+define i1 @f_or(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_or(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[OR14:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    [[OR1115:%.*]] = or i1 [[CMP_NOT]], [[OR14]]
+; CHECK-NEXT:    ret i1 [[OR1115]]
+;
+entry:
+  %cmp.not = icmp eq i32 %x, 0
+  %0 = or i32 %x, %y
+  %or14 = icmp ne i32 %0, 0
+  %or1115 = or i1 %cmp.not, %or14
+  ret i1 %or1115
+}
+
+; Tests for more binops
+
+define i1 @f_add(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 16
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp sgt i32 %x, 5
+  %0 = add nsw i32 %yr, %x
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 1
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %cmp = icmp ugt i32 %x, 1
+  %0 = add nuw i32 %x, %y
+  %cmp2 = icmp eq i32 %0, 1
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 [[X]], [[YR]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp slt i32 %x, 5
+  %0 = sub nsw i32 %x, %yr
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_sub_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = sub nuw i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 6
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %cmp = icmp ult i32 %x, 5
+  %0 = sub nuw i32 %x, %y
+  %cmp2 = icmp eq i32 %0, 6
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; Negative tests
+
+; non-constant range
+define i1 @f_add_nofold1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], [[Z]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, %z
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 16
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+define i1 @f_add_nofold2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold2(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], [[Z]]
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, %z
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; narrower range
+define i1 @f_add_nofold3(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nofold3(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 10
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 7
+  %cmp = icmp ult i32 %x, 8
+  %0 = add i32 %yr, %x
+  %cmp2 = icmp ugt i32 %0, 10
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
+
+; sub is not commutative
+define i1 @f_sub_nsw_nofold(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw_nofold(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 [[YR]], [[X]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+entry:
+  %yr = and i32 %y, 2147483647
+  %cmp = icmp slt i32 %x, 5
+  %0 = sub nsw i32 %yr, %x
+  %cmp2 = icmp slt i32 %0, 5
+  %and = and i1 %cmp, %cmp2
+  ret i1 %and
+}
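
A quick way to see why f_add folds while the "narrower range" negative
test must not: with %x <u 8 and %yr = %y & 7, both addends lie in
[0, 7], so their sum lies in [0, 14]. A minimal standalone C++ sketch
of that arithmetic (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t MaxX = 7;              // from "icmp ult i32 %x, 8"
  const uint32_t MaxYr = 7;             // from "and i32 %y, 7"
  const uint32_t MaxSum = MaxX + MaxYr; // %x + %yr is in [0, 14]
  assert(!(MaxSum > 16)); // f_add: "icmp ugt i32 %0, 16" is always false
  assert(MaxSum > 10);    // f_add_nofold3: "ugt ... 10" can still be true
  return 0;
}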

From 5bba0346e36c77663fa77fd06d0495e7e80fc069 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sat, 21 Oct 2023 20:49:31 +0800
Subject: [PATCH 2/5] [ValueTracking] Improve `isImpliedCondICmps` to handle
 binops

---
 llvm/lib/Analysis/ValueTracking.cpp           | 48 ++++++++++++++++---
 .../ValueTracking/implied-icmp-binop.ll       | 44 +++--------------
 llvm/test/Transforms/InstCombine/icmp-or.ll   | 18 ++-----
 .../icmp-power2-and-icmp-shifted-mask.ll      | 40 ++++------------
 4 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2458c1cb9f8ec1d..8e3577ea8a56482 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8253,14 +8253,11 @@ isImpliedCondMatchingOperands(CmpInst::Predicate LPred,
   return std::nullopt;
 }
 
-/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
-/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Return true if `X in DomCR` implies `X in CR` is true.
+/// Return false if `X in DomCR` implies `X in CR` is false.
 /// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional<bool> isImpliedCondCommonOperandWithConstants(
-    CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
-    const APInt &RC) {
-  ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
-  ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+static std::optional<bool> isImpliedCondWithRange(const ConstantRange &DomCR,
+                                                  const ConstantRange &CR) {
   ConstantRange Intersection = DomCR.intersectWith(CR);
   ConstantRange Difference = DomCR.difference(CR);
   if (Intersection.isEmptySet())
@@ -8270,6 +8267,17 @@ static std::optional<bool> isImpliedCondCommonOperandWithConstants(
   return std::nullopt;
 }
 
+/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
+/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Otherwise, return std::nullopt if we can't infer anything.
+static std::optional<bool> isImpliedCondCommonOperandWithConstants(
+    CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
+    const APInt &RC) {
+  ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
+  ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+  return isImpliedCondWithRange(DomCR, CR);
+}
+
 /// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1")
 /// is true.  Return false if LHS implies RHS is false. Otherwise, return
 /// std::nullopt if we can't infer anything.
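
For reference, the semantics of the extracted helper can be checked in
isolation. A minimal sketch, assuming only public LLVM headers;
impliedByRange is a hypothetical stand-in for the file-local
isImpliedCondWithRange:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include <cassert>
#include <optional>

using namespace llvm;

// Mirrors the intersection/difference logic shown in the hunk above.
static std::optional<bool> impliedByRange(const ConstantRange &DomCR,
                                          const ConstantRange &CR) {
  if (DomCR.intersectWith(CR).isEmptySet())
    return false; // no X in DomCR lies in CR
  if (DomCR.difference(CR).isEmptySet())
    return true; // every X in DomCR lies in CR
  return std::nullopt;
}

int main() {
  APInt Zero(32, 0);
  // "X != 0" implies "X == 0" is false: the two regions are disjoint.
  assert(!*impliedByRange(
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_NE, Zero),
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_EQ, Zero)));
  // "X <u 8" implies "X <u 17" is true: [0,8) is a subset of [0,17).
  assert(*impliedByRange(
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_ULT, APInt(32, 8)),
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_ULT,
                                         APInt(32, 17))));
  return 0;
}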
@@ -8320,6 +8328,32 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
       return LPred == RPred;
   }
 
+  // Handle R0 = L0 binop V.
+  Value *R0Op1 = nullptr;
+  if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)) &&
+      match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
+    ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
+    ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
+    // TODO: use contextual information from SimplifyQuery
+    ConstantRange RHSRange = computeConstantRange(
+        R0Op1, ICmpInst::isSigned(RPred), /*UseInstrInfo*/ true, /*AC*/ nullptr,
+        /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
+    auto BO = cast<BinaryOperator>(R0);
+    if (BO->getOperand(0) != L0)
+      std::swap(LHSRange, RHSRange);
+    unsigned NoWrapKind = 0;
+    if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
+      if (OBO->hasNoUnsignedWrap())
+        NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
+      if (OBO->hasNoSignedWrap())
+        NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
+    }
+    ConstantRange Range =
+        LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
+    if (auto Res = isImpliedCondWithRange(Range, CR))
+      return Res;
+  }
+
   if (LPred == RPred)
     return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
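
To see the new binop path on the f_add test concretely: the dominating
condition gives x a range, computeConstantRange gives the other add
operand a range, and their sum range is compared against the region of
the second icmp. A standalone sketch using public ConstantRange APIs
(constants taken from f_add; illustrative, not the patched code):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include <cassert>

using namespace llvm;

int main() {
  // Dominating condition: x <u 8, so x is in [0, 8).
  ConstantRange LHSRange =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_ULT, APInt(32, 8));
  // computeConstantRange on yr = y & 7 yields [0, 8).
  ConstantRange RHSRange(APInt(32, 0), APInt(32, 8));
  // Plain add, so NoWrapKind == 0: [0, 8) + [0, 8) = [0, 15).
  ConstantRange Sum = LHSRange.overflowingBinaryOp(
      Instruction::Add, RHSRange, /*NoWrapKind=*/0);
  // Second condition: (x + yr) >u 16, i.e. the region [17, UINT_MAX].
  ConstantRange CR =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_UGT, APInt(32, 16));
  // [0, 15) cannot reach [17, UINT_MAX]: the implied result is false.
  assert(Sum.intersectWith(CR).isEmptySet());
  return 0;
}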
 
diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
index a85214346c5a08a..882c38f329bd884 100644
--- a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -7,11 +7,7 @@ define i1 @f_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_and(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[AND14:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT:    [[AND1115:%.*]] = and i1 [[CMP]], [[AND14]]
-; CHECK-NEXT:    ret i1 [[AND1115]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %cmp = icmp ne i32 %x, 0
@@ -25,11 +21,7 @@ define i1 @f_or(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_or(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[OR14:%.*]] = icmp ne i32 [[TMP0]], 0
-; CHECK-NEXT:    [[OR1115:%.*]] = or i1 [[CMP_NOT]], [[OR14]]
-; CHECK-NEXT:    ret i1 [[OR1115]]
+; CHECK-NEXT:    ret i1 true
 ;
 entry:
   %cmp.not = icmp eq i32 %x, 0
@@ -45,12 +37,7 @@ define i1 @f_add(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_add(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 7
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 8
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[YR]], [[X]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[AND]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %yr = and i32 %y, 7
@@ -65,12 +52,7 @@ define i1 @f_add_nsw(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_add_nsw(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X]], 5
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[YR]], [[X]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[AND]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %yr = and i32 %y, 2147483647
@@ -85,11 +67,7 @@ define i1 @f_add_nuw(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_add_nuw(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[X]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = add nuw i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[AND]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %cmp = icmp ugt i32 %x, 1
@@ -103,12 +81,8 @@ define i1 @f_sub_nsw(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_sub_nsw(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[YR:%.*]] = and i32 [[Y]], 2147483647
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X]], 5
-; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 [[X]], [[YR]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[AND]]
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
 entry:
   %yr = and i32 %y, 2147483647
@@ -123,11 +97,7 @@ define i1 @f_sub_nuw(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i1 @f_sub_nuw(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[X]], 5
-; CHECK-NEXT:    [[TMP0:%.*]] = sub nuw i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 6
-; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[AND]]
+; CHECK-NEXT:    ret i1 false
 ;
 entry:
   %cmp = icmp ult i32 %x, 5
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d82..a96341f31132943 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -430,13 +430,8 @@ define i1 @icmp_or_xor_2_ne_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
 
 define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
 ; CHECK-LABEL: @icmp_or_xor_2_3_fail(
-; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]]
-; CHECK-NEXT:    [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]]
-; CHECK-NEXT:    [[OR:%.*]] = or i64 [[XOR]], [[XOR1]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[OR]], 0
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[XOR]], 0
-; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]]
-; CHECK-NEXT:    ret i1 [[OR1]]
+; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[X1:%.*]], [[Y1:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP_1]]
 ;
   %xor = xor i64 %x1, %y1
   %xor1 = xor i64 %x2, %y2
@@ -451,13 +446,8 @@ define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
 
 define i1 @icmp_or_xor_2_4_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
 ; CHECK-LABEL: @icmp_or_xor_2_4_fail(
-; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]]
-; CHECK-NEXT:    [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]]
-; CHECK-NEXT:    [[OR:%.*]] = or i64 [[XOR]], [[XOR1]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[OR]], 0
-; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[XOR1]], 0
-; CHECK-NEXT:    [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]]
-; CHECK-NEXT:    ret i1 [[OR1]]
+; CHECK-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[X2:%.*]], [[Y2:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP_1]]
 ;
   %xor = xor i64 %x1, %y1
   %xor1 = xor i64 %x2, %y2
diff --git a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
index 82fcca07a00ac66..27ecc5686066cf1 100644
--- a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
@@ -250,10 +250,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_256_239_gap_in_mask_fail(i3
 define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 112
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 112
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 112
@@ -265,10 +262,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) {
 define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 112
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 112
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 112
@@ -281,10 +275,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32
 define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 56
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 56
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 56
@@ -296,10 +287,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) {
 define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 56
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 56
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 56
@@ -312,10 +300,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32
 define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 24
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 24
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 24
@@ -327,10 +312,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) {
 define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 24
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 24
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 24
@@ -343,10 +325,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32
 define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 12
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 12
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 12
@@ -358,10 +337,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) {
 define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail(i32 %x) {
 ; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail(
 ; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[T2:%.*]] = and i32 [[X]], 12
-; CHECK-NEXT:    [[T3:%.*]] = icmp ne i32 [[T2]], 12
-; CHECK-NEXT:    [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT:    ret i1 [[T4]]
+; CHECK-NEXT:    ret i1 [[T1]]
 ;
   %t1 = icmp ult i32 %x, 8
   %t2 = and i32 %x, 12

From 4d2df870f4bbdcedd512a4862bbe7873e7e58954 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 22 Oct 2023 03:45:15 +0800
Subject: [PATCH 3/5] fixup! [ValueTracking] Improve `isImpliedCondICmps` to
 handle binops

---
 llvm/lib/Analysis/ValueTracking.cpp           |  58 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            | 593 +++++++--------
 llvm/test/CodeGen/AMDGPU/srem64.ll            | 678 ++++++++----------
 llvm/test/CodeGen/AMDGPU/udiv64.ll            | 654 ++++++++---------
 llvm/test/CodeGen/AMDGPU/urem64.ll            | 534 +++++++-------
 llvm/test/CodeGen/PowerPC/tail-dup-layout.ll  |  10 +-
 .../CodeGen/Thumb2/mve-float16regloops.ll     |  48 +-
 .../CodeGen/Thumb2/mve-float32regloops.ll     |  60 +-
 8 files changed, 1201 insertions(+), 1434 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8e3577ea8a56482..255298c01185450 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8302,8 +8302,36 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   // Can we infer anything when the 0-operands match and the 1-operands are
   // constants (not necessarily matching)?
   const APInt *LC, *RC;
-  if (L0 == R0 && match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)))
-    return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+  if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC))) {
+    if (L0 == R0)
+      return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+
+    // Handle R0 = L0 binop V and R0 = V binop L0.
+    Value *R0Op1 = nullptr;
+    if (match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
+      ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
+      ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
+      // TODO: use contextual information from SimplifyQuery
+      ConstantRange RHSRange =
+          computeConstantRange(R0Op1, ICmpInst::isSigned(RPred),
+                               /*UseInstrInfo*/ true, /*AC*/ nullptr,
+                               /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
+      auto *BO = cast<BinaryOperator>(R0);
+      if (BO->getOperand(0) != L0)
+        std::swap(LHSRange, RHSRange);
+      unsigned NoWrapKind = 0;
+      if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
+        if (OBO->hasNoUnsignedWrap())
+          NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
+        if (OBO->hasNoSignedWrap())
+          NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
+      }
+      ConstantRange Range =
+          LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
+      if (auto Res = isImpliedCondWithRange(Range, CR))
+        return Res;
+    }
+  }
 
   // L0 = R0 = L1 + R1, L0 >=u L1 implies R0 >=u R1, L0 <u L1 implies R0 <u R1
   if (ICmpInst::isUnsigned(LPred) && ICmpInst::isUnsigned(RPred)) {
@@ -8328,32 +8356,6 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
       return LPred == RPred;
   }
 
-  // Handle R0 = L0 binop V.
-  Value *R0Op1 = nullptr;
-  if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)) &&
-      match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
-    ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
-    ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
-    // TODO: use contextual information from SimplifyQuery
-    ConstantRange RHSRange = computeConstantRange(
-        R0Op1, ICmpInst::isSigned(RPred), /*UseInstrInfo*/ true, /*AC*/ nullptr,
-        /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
-    auto BO = cast<BinaryOperator>(R0);
-    if (BO->getOperand(0) != L0)
-      std::swap(LHSRange, RHSRange);
-    unsigned NoWrapKind = 0;
-    if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
-      if (OBO->hasNoUnsignedWrap())
-        NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
-      if (OBO->hasNoSignedWrap())
-        NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
-    }
-    ConstantRange Range =
-        LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
-    if (auto Res = isImpliedCondWithRange(Range, CR))
-      return Res;
-  }
-
   if (LPred == RPred)
     return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
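
Moving the block earlier does not change the wrap-flag handling, which
is what lets the nsw/nuw tests fold. A standalone sketch of that effect
with the f_add_nsw constants (illustrative only; the APIs are public
ConstantRange methods):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include <cassert>

using namespace llvm;

int main() {
  // x >s 5, so x is in [6, INT_MAX] (signed view).
  ConstantRange X =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_SGT, APInt(32, 5));
  // yr = y & 0x7fffffff, so yr is in [0, INT_MAX].
  ConstantRange Yr(APInt(32, 0), APInt(32, 0x80000000u));
  // Question: (x + yr) <s 5.
  ConstantRange CR =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_SLT, APInt(32, 5));
  // Without nsw the sum may wrap negative, so nothing is implied.
  ConstantRange Plain = X.overflowingBinaryOp(Instruction::Add, Yr, 0);
  assert(!Plain.intersectWith(CR).isEmptySet());
  // With nsw the sum stays in [6, INT_MAX], disjoint from "<s 5",
  // which is why f_add_nsw folds to false.
  ConstantRange WithNSW = X.overflowingBinaryOp(
      Instruction::Add, Yr, OverflowingBinaryOperator::NoSignedWrap);
  assert(WithNSW.intersectWith(CR).isEmptySet());
  return 0;
}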
 
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 9cb6842ae0a1827..950e8c60ef9d01f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -142,7 +142,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
@@ -156,16 +155,16 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s13
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[8:9]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT:    s_min_u32 s18, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s12
+; GCN-IR-NEXT:    s_add_i32 s9, s9, 32
+; GCN-IR-NEXT:    s_min_u32 s18, s9, s14
+; GCN-IR-NEXT:    s_sub_u32 s16, s8, s18
 ; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
@@ -174,27 +173,21 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
 ; GCN-IR-NEXT:    s_or_b64 s[20:21], s[20:21], s[22:23]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s20, s16, 1
-; GCN-IR-NEXT:    s_addc_u32 s21, s17, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s20
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s17, s16, 1
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s17
 ; GCN-IR-NEXT:    s_add_u32 s19, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s20, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s18
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[12:13], s[8:9]
+; GCN-IR-NEXT:    s_add_u32 s12, s12, s18
+; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT:  .LBB0_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
@@ -214,11 +207,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[22:23]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT:  .LBB0_4: ; %Flow7
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB0_4: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
 ; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
@@ -372,86 +365,75 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v5, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
-; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v2, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v10
-; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v11
-; GCN-IR-NEXT:    v_min_u32_e32 v13, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[6:7], v12, v13
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[2:3]
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v2, v2, v3
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v10
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v11
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v3, v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v13, vcc, v2, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v14, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[13:14]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[13:14]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v9, v12
-; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[10:11], v14
-; GCN-IR-NEXT:    v_not_b32_e32 v8, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v13
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 63, v13
+; GCN-IR-NEXT:    v_add_i32_e32 v17, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v18, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[10:11], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[13:14], v[10:11], v14
+; GCN-IR-NEXT:    v_not_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v2, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v14, v14, v8
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v16, v14
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v17, v15, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[13:14], v[13:14], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v9
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v13, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v17, v12
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v18, v14, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v15, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v13
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v13, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v14, v12
-; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v9
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v0
-; GCN-IR-NEXT:  .LBB1_6: ; %Flow5
+; GCN-IR-NEXT:    v_sub_i32_e64 v13, s[4:5], v12, v13
+; GCN-IR-NEXT:    v_or_b32_e32 v9, v16, v9
+; GCN-IR-NEXT:    v_subb_u32_e64 v14, s[4:5], v14, v15, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, v3
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v2, v8
+; GCN-IR-NEXT:  .LBB1_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v4
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v7, v6
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v8, v0
@@ -971,7 +953,6 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-LABEL: s_test_sdiv24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[4:5], 24
@@ -993,16 +974,16 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s13
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT:    s_min_u32 s18, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s12
+; GCN-IR-NEXT:    s_add_i32 s9, s9, 32
+; GCN-IR-NEXT:    s_min_u32 s18, s9, s14
+; GCN-IR-NEXT:    s_sub_u32 s16, s8, s18
 ; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
@@ -1011,27 +992,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
 ; GCN-IR-NEXT:    s_or_b64 s[20:21], s[20:21], s[22:23]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s20, s16, 1
-; GCN-IR-NEXT:    s_addc_u32 s21, s17, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s20
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s17, s16, 1
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s17
 ; GCN-IR-NEXT:    s_add_u32 s19, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s20, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s18
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[12:13], s[8:9]
+; GCN-IR-NEXT:    s_add_u32 s12, s12, s18
+; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT:  .LBB9_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
@@ -1051,11 +1026,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[22:23]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB9_4: ; %udiv-end
 ; GCN-IR-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[4:5], s[2:3]
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
@@ -1196,7 +1171,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_sdiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-IR-NEXT:    s_mov_b32 s5, s4
@@ -1206,61 +1181,54 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
-; GCN-IR-NEXT:    s_add_u32 s12, s10, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s13, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_and_b64 s[8:9], s[14:15], exec
-; GCN-IR-NEXT:    s_cselect_b32 s8, 0, 24
+; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s12, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[6:7], s[14:15]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s11, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s11
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s14
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s6, s10, 1
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s10
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s6
 ; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_sub_u32 s12, 58, s12
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:  .LBB10_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT:    s_subb_u32 s13, s13, s15
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s8, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s9, s8
+; GCN-IR-NEXT:    s_and_b32 s6, s8, 1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s8
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s9
+; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
+; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_3
-; GCN-IR-NEXT:  .LBB10_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
-; GCN-IR-NEXT:  .LBB10_5: ; %udiv-end
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB10_4: ; %udiv-end
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s4, s6, s4
 ; GCN-IR-NEXT:    s_subb_u32 s5, s7, s5
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
@@ -1388,82 +1356,72 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v0, v1
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v5, vcc, s6, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, s6, v8
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, 24, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB11_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB11_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v9
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 63, v6
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v5, vcc
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], 24, v6
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB11_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB11_5: ; %Flow4
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v1
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v0
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v6, v0
-; GCN-IR-NEXT:  .LBB11_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
+; GCN-IR-NEXT:  .LBB11_4: ; %Flow
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v7, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1620,45 +1578,43 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v9
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB12_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v6, v0
-; GCN-IR-NEXT:  .LBB12_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v1
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v0
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v7, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
+; GCN-IR-NEXT:  .LBB12_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1683,81 +1639,70 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v0, v2
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v7
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
+; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v6
 ; GCN-IR-NEXT:    v_add_i32_e64 v0, s[4:5], 32, v0
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v8
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v7
 ; GCN-IR-NEXT:    v_min_u32_e32 v0, v0, v1
-; GCN-IR-NEXT:    v_sub_i32_e64 v3, s[4:5], 48, v0
-; GCN-IR-NEXT:    v_subb_u32_e64 v4, s[4:5], 0, 0, s[4:5]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], 48, v0
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v7, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB13_6
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v6, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB13_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v3, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[7:8], v3
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB13_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[9:10], v[7:8], v9
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 0xffffffcf, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], v4
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[6:7], v9
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT:  .LBB13_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:  .LBB13_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[9:10], v[9:10], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v0, v9, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, s12, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v10, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v7
-; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[3:4], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_and_b32_e32 v5, 1, v9
-; GCN-IR-NEXT:    v_and_b32_e32 v9, 0x8000, v9
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v9, s[4:5], v0, v9
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB13_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[3:4], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:  .LBB13_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v0
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s10, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v1
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v0
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v6, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT:  .LBB13_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v4, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v5, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, 32768
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 520ec6e24ae3bfe..0191930aabba5a1 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
-; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT:    s_min_u32 s6, s10, s11
+; GCN-IR-NEXT:    s_add_i32 s7, s7, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s7, s10
+; GCN-IR-NEXT:    s_sub_u32 s14, s6, s10
+; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 s[8:9], s[16:17], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s11, s14, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s14
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s11
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
-; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[18:19], s[6:7]
+; GCN-IR-NEXT:    s_add_u32 s10, s18, s10
+; GCN-IR-NEXT:    s_addc_u32 s11, s19, 0
+; GCN-IR-NEXT:  .LBB0_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT:    s_subb_u32 s13, s13, s15
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s12, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s13, s12
+; GCN-IR-NEXT:    s_and_b32 s6, s12, 1
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s13
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT:  .LBB0_4: ; %Flow7
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB0_4: ; %udiv-end
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s12, s0
@@ -349,85 +342,74 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v2
-; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 32, v6
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v3
 ; GCN-IR-NEXT:    v_min_u32_e32 v10, v6, v7
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v6, v0
-; GCN-IR-NEXT:    v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 32, v6
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v11, v6, v7
-; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[6:7], v10, v11
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_min_u32_e32 v6, v6, v7
+; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v10, v6
+; GCN-IR-NEXT:    v_subb_u32_e64 v12, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[11:12]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v4
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[11:12]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 63, v6
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[0:1], v6
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v11
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 63, v11
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v9, v10
+; GCN-IR-NEXT:    v_not_b32_e32 v10, v10
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[0:1], v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_not_b32_e32 v8, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v9, v11
+; GCN-IR-NEXT:    v_not_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v10, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v16, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v17, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v14, v6
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v9
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v16, v12
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v17, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v14, v8
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v15, v7
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
+; GCN-IR-NEXT:    v_or_b32_e32 v9, v15, v9
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v14
 ; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
 ; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v14
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v9
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v8
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v7
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v6
-; GCN-IR-NEXT:  .LBB1_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v6, v8
+; GCN-IR-NEXT:  .LBB1_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_mul_lo_u32 v6, v2, v9
 ; GCN-IR-NEXT:    v_mul_hi_u32 v7, v2, v8
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v8
@@ -1013,7 +995,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 31
@@ -1029,69 +1011,62 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_subb_u32 s9, s7, s10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s3
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s8
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s9
-; GCN-IR-NEXT:    s_min_u32 s12, s6, s7
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_min_u32 s16, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    s_min_u32 s6, s6, s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT:    s_add_i32 s7, s7, 32
+; GCN-IR-NEXT:    s_min_u32 s12, s7, s12
+; GCN-IR-NEXT:    s_sub_u32 s16, s6, s12
+; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[16:17], 63
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
 ; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s18
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s13, s16, 1
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[2:3], s13
 ; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s9, -1
-; GCN-IR-NEXT:    s_not_b64 s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_add_u32 s12, s6, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[20:21], s[6:7]
+; GCN-IR-NEXT:    s_add_u32 s12, s20, s12
+; GCN-IR-NEXT:    s_addc_u32 s13, s21, 0
+; GCN-IR-NEXT:  .LBB8_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s6, s18, s14
-; GCN-IR-NEXT:    s_subb_u32 s6, s19, s15
-; GCN-IR-NEXT:    s_ashr_i32 s16, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s6, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s6, s18, s16
+; GCN-IR-NEXT:    s_subb_u32 s6, s19, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s16, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
-; GCN-IR-NEXT:  .LBB8_4: ; %Flow7
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
-; GCN-IR-NEXT:  .LBB8_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB8_4: ; %udiv-end
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GCN-IR-NEXT:    s_mul_i32 s11, s8, s11
@@ -1158,7 +1133,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-LABEL: s_test_srem24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s7
@@ -1180,69 +1155,62 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s5
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s4
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s5
-; GCN-IR-NEXT:    s_min_u32 s16, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s4
+; GCN-IR-NEXT:    s_add_i32 s9, s9, 32
+; GCN-IR-NEXT:    s_min_u32 s12, s9, s12
+; GCN-IR-NEXT:    s_sub_u32 s16, s8, s12
+; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[16:17], 63
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
 ; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s5
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s4
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s14
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[4:5], s18
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s13, s16, 1
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[4:5], s13
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[20:21], s[8:9]
+; GCN-IR-NEXT:    s_add_u32 s12, s20, s12
+; GCN-IR-NEXT:    s_addc_u32 s13, s21, 0
+; GCN-IR-NEXT:  .LBB9_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s8, s18, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s19, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s16, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB9_4: ; %udiv-end
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GCN-IR-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
@@ -1386,76 +1354,69 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_srem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s8, s3, 31
-; GCN-IR-NEXT:    s_mov_b32 s9, s8
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s4, s2, s8
-; GCN-IR-NEXT:    s_subb_u32 s5, s3, s8
+; GCN-IR-NEXT:    s_ashr_i32 s6, s3, 31
+; GCN-IR-NEXT:    s_mov_b32 s7, s6
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s4, s2, s6
+; GCN-IR-NEXT:    s_subb_u32 s5, s3, s6
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s4
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s5
-; GCN-IR-NEXT:    s_min_u32 s8, s2, s3
-; GCN-IR-NEXT:    s_add_u32 s2, s8, 0xffffffc5
+; GCN-IR-NEXT:    s_min_u32 s10, s2, s3
+; GCN-IR-NEXT:    s_add_u32 s2, s10, 0xffffffc5
 ; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[2:3], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[2:3], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[12:13], exec
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, 24
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s2, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s3, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    s_add_i32 s6, s2, 1
 ; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], 24, s2
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s6
 ; GCN-IR-NEXT:    s_add_u32 s14, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s5, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:  .LBB10_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s6, s3, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s6, s14, s10
-; GCN-IR-NEXT:    s_subb_u32 s6, s15, s11
-; GCN-IR-NEXT:    s_ashr_i32 s12, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s6, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s6, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s6, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s8, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s9, s8
+; GCN-IR-NEXT:    s_and_b32 s6, s8, 1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s8
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s9
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_3
-; GCN-IR-NEXT:  .LBB10_4: ; %Flow6
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[2:3]
-; GCN-IR-NEXT:  .LBB10_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB10_4: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s6, s4, s11
-; GCN-IR-NEXT:    s_mul_i32 s5, s5, s10
-; GCN-IR-NEXT:    s_mul_i32 s4, s4, s10
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    s_mul_i32 s7, s4, s7
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s6
+; GCN-IR-NEXT:    s_mul_i32 s4, s4, s6
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
 ; GCN-IR-NEXT:    v_sub_i32_e64 v0, vcc, 24, s4
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
@@ -1584,78 +1545,68 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, 24, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB11_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB11_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB11_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB11_5: ; %Flow4
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT:  .LBB11_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT:  .LBB11_4: ; %Flow
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1774,9 +1725,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    s_movk_i32 s8, 0xffd0
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s8, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
@@ -1803,52 +1754,50 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB12_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT:  .LBB12_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT:  .LBB12_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1877,75 +1826,64 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v0
-; GCN-IR-NEXT:    v_add_i32_e64 v3, s[4:5], 32, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v3, v4
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 48, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GCN-IR-NEXT:    v_add_i32_e64 v4, s[4:5], 32, v4
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v4, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], 48, v4
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB13_6
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB13_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB13_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 63, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[0:1], v6
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v9
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 0xffffffcf, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT:  .LBB13_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:  .LBB13_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, s12, v10
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s10, v10
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v12, 0x8000, v12
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v13, v7
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB13_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT:  .LBB13_6: ; %Flow5
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v4, v6
+; GCN-IR-NEXT:  .LBB13_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], 15
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e809292aad1d38b..6fc0e90b7724460 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -129,15 +129,15 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
-; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT:    s_min_u32 s6, s10, s11
+; GCN-IR-NEXT:    s_add_i32 s7, s7, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT:    s_min_u32 s14, s7, s10
+; GCN-IR-NEXT:    s_sub_u32 s12, s6, s14
 ; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
@@ -146,28 +146,21 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s13, s12, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s13
 ; GCN-IR-NEXT:    s_add_u32 s15, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s16, s5, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
+; GCN-IR-NEXT:    s_not_b64 s[2:3], s[6:7]
 ; GCN-IR-NEXT:    s_add_u32 s2, s2, s14
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT:  .LBB0_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
@@ -187,11 +180,11 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[2:3], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT:  .LBB0_4: ; %Flow7
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
-; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB0_4: ; %udiv-end
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
@@ -316,86 +309,75 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-LABEL: v_test_udiv_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v6, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v9, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[6:7], v8, v9
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v1, 0, s[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v6, v8
+; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v6
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v9
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v13, vcc, -1, v2
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v2
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v10
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
+; GCN-IR-NEXT:    v_lshr_b64 v[9:10], v[0:1], v10
+; GCN-IR-NEXT:    v_addc_u32_e32 v14, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v0, v6
 ; GCN-IR-NEXT:    v_not_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[9:10], v[9:10], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v9, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v13, v8
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v14, v10, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v9, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v9, v8, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v8, v8, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v11, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v9
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v9, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v9, v9, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v6, v0
-; GCN-IR-NEXT:  .LBB1_6: ; %Flow5
+; GCN-IR-NEXT:    v_sub_i32_e64 v9, s[4:5], v8, v9
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v12, v5
+; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[4:5], v10, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v7
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, v5
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v6, v4
+; GCN-IR-NEXT:  .LBB1_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, %y
   ret i64 %result
@@ -784,7 +766,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-LABEL: s_test_udiv24_i48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_and_b32 s3, s5, 0xffff
 ; GCN-IR-NEXT:    s_and_b32 s2, s4, 0xff000000
@@ -796,16 +777,16 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s9
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
 ; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s4, s5
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s8
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s9
-; GCN-IR-NEXT:    s_min_u32 s14, s4, s5
-; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_min_u32 s4, s4, s5
+; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s8
+; GCN-IR-NEXT:    s_add_i32 s5, s5, 32
+; GCN-IR-NEXT:    s_min_u32 s14, s5, s10
+; GCN-IR-NEXT:    s_sub_u32 s12, s4, s14
 ; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
@@ -814,27 +795,21 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s9
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s8
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], s12
+; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[8:9], s16
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s13, s12, 1
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], s6
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[8:9], s13
 ; GCN-IR-NEXT:    s_add_u32 s15, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s16, s3, -1
-; GCN-IR-NEXT:    s_not_b64 s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_add_u32 s8, s4, s14
-; GCN-IR-NEXT:    s_addc_u32 s9, s5, 0
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
-; GCN-IR-NEXT:  .LBB7_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_add_u32 s8, s8, s14
+; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-IR-NEXT:  .LBB7_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
@@ -854,11 +829,11 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[8:9], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
-; GCN-IR-NEXT:  .LBB7_4: ; %Flow4
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT:  .LBB7_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB7_4: ; %udiv-end
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
@@ -984,69 +959,62 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_udiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
-; GCN-IR-NEXT:    s_add_u32 s10, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 s[6:7], s[12:13], exec
-; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT:    s_min_u32 s10, s8, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s10, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[4:5], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s9
+; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s12
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s4, s8, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s4
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
-; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:  .LBB8_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT:    s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s6, s4, 31
+; GCN-IR-NEXT:    s_mov_b32 s7, s6
+; GCN-IR-NEXT:    s_and_b32 s4, s6, 1
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s6
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s7
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
-; GCN-IR-NEXT:  .LBB8_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT:  .LBB8_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[2:3]
+; GCN-IR-NEXT:  .LBB8_4: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 24, %x
@@ -1157,13 +1125,15 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v6, v0, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0x8000
@@ -1192,45 +1162,41 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB9_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v0, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT:  .LBB9_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v1
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v0
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-IR-NEXT:    v_lshl_b64 v[1:2], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v0, v1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:  .LBB9_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 32768, %x
   ret i64 %result
@@ -1250,73 +1216,62 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 48, v6
-; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT:    v_min_u32_e32 v4, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 48, v4
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB10_6
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v1, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, 63, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB10_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffcf, v6
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v7
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffcf, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:  .LBB10_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v7, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
-; GCN-IR-NEXT:    v_and_b32_e32 v7, 0x8000, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 0x8000, v8
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
-; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
-; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB10_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB10_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT:  .LBB10_6: ; %Flow5
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB10_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GCN-IR-NEXT:  .LBB10_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 32768
   ret i64 %result
@@ -1405,66 +1360,60 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_udiv_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_min_u32 s10, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s8, 59, s10
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
-; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s8
+; GCN-IR-NEXT:    s_min_u32 s6, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s10, 59, s6
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], s12
-; GCN-IR-NEXT:    s_add_u32 s2, s10, 0xffffffc4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s11, s10, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s10
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s11
+; GCN-IR-NEXT:    s_add_u32 s2, s6, 0xffffffc4
 ; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
-; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT:  .LBB11_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, 23, s8
-; GCN-IR-NEXT:    s_subb_u32 s4, 0, s9
-; GCN-IR-NEXT:    s_ashr_i32 s10, s4, 31
-; GCN-IR-NEXT:    s_and_b32 s4, s10, 1
-; GCN-IR-NEXT:    s_and_b32 s10, s10, 24
-; GCN-IR-NEXT:    s_sub_u32 s8, s8, s10
-; GCN-IR-NEXT:    s_subb_u32 s9, s9, 0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, 23, s10
+; GCN-IR-NEXT:    s_subb_u32 s4, 0, s11
+; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 31
+; GCN-IR-NEXT:    s_and_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_and_b32 s4, s4, 24
+; GCN-IR-NEXT:    s_sub_u32 s10, s10, s4
+; GCN-IR-NEXT:    s_subb_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_add_u32 s2, s2, 1
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_3
-; GCN-IR-NEXT:  .LBB11_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT:  .LBB11_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB11_4: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, 24
@@ -1551,72 +1500,61 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 59, v6
-; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT:    v_min_u32_e32 v4, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 59, v4
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v1, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB12_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, 63, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc4, v6
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v7
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc4, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT:  .LBB12_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v7, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 23, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
-; GCN-IR-NEXT:    v_and_b32_e32 v7, 24, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 24, v8
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
-; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
-; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT:  .LBB12_6: ; %Flow5
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GCN-IR-NEXT:  .LBB12_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 24
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 9c316612528c208..e91053384b3cec0 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
-; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT:    s_min_u32 s6, s10, s11
+; GCN-IR-NEXT:    s_add_i32 s7, s7, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s7, s10
+; GCN-IR-NEXT:    s_sub_u32 s14, s6, s10
+; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 s[8:9], s[16:17], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s11, s14, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s14
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s11
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
-; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_not_b64 s[18:19], s[6:7]
+; GCN-IR-NEXT:    s_add_u32 s10, s18, s10
+; GCN-IR-NEXT:    s_addc_u32 s11, s19, 0
+; GCN-IR-NEXT:  .LBB0_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
 ; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT:    s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT:    s_subb_u32 s13, s13, s15
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s12, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s13, s12
+; GCN-IR-NEXT:    s_and_b32 s6, s12, 1
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s13
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT:  .LBB0_4: ; %Flow7
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:  .LBB0_4: ; %udiv-end
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s12, s0
@@ -325,84 +318,73 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-LABEL: v_test_urem_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
 ; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v9, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[6:7], v8, v9
-; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[6:7], 63, v[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_min_u32_e32 v4, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v8, v4
+; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB1_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v9
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 63, v9
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v7, v8
+; GCN-IR-NEXT:    v_not_b32_e32 v8, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[0:1], v6
 ; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v10
-; GCN-IR-NEXT:    v_not_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v7, v9
+; GCN-IR-NEXT:    v_not_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v8, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
 ; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
 ; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT:  .LBB1_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v4, v6
+; GCN-IR-NEXT:  .LBB1_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_mul_lo_u32 v4, v2, v7
 ; GCN-IR-NEXT:    v_mul_hi_u32 v5, v2, v6
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v6
@@ -812,74 +794,67 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_urem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
-; GCN-IR-NEXT:    s_add_u32 s10, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 s[6:7], s[12:13], exec
-; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT:    s_min_u32 s10, s8, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s10, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[4:5], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s9
+; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s12
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s4, s8, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s4
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
-; GCN-IR-NEXT:  .LBB6_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:  .LBB6_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT:    s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s6, s4, 31
+; GCN-IR-NEXT:    s_mov_b32 s7, s6
+; GCN-IR-NEXT:    s_and_b32 s4, s6, 1
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s6
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s7
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_3
-; GCN-IR-NEXT:  .LBB6_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-IR-NEXT:  .LBB6_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:  .LBB6_4: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s8, s0
-; GCN-IR-NEXT:    s_mul_i32 s0, s2, s7
+; GCN-IR-NEXT:    s_mul_i32 s0, s2, s5
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s3, s6
+; GCN-IR-NEXT:    s_mul_i32 s0, s3, s4
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s2, s6
+; GCN-IR-NEXT:    s_mul_i32 s0, s2, s4
 ; GCN-IR-NEXT:    v_sub_i32_e64 v0, vcc, 24, s0
 ; GCN-IR-NEXT:    s_mov_b32 s10, -1
 ; GCN-IR-NEXT:    s_mov_b32 s9, s1
@@ -972,75 +947,69 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-LABEL: s_test_urem_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s10, 59, s8
+; GCN-IR-NEXT:    s_min_u32 s6, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s10, 59, s6
 ; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[6:7], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
-; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s9
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s12
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 0xffffffc4
-; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
-; GCN-IR-NEXT:  .LBB7_3: ; %udiv-do-while
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    s_add_i32 s11, s10, 1
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s10
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s11
+; GCN-IR-NEXT:    s_add_u32 s10, s6, 0xffffffc4
+; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT:  .LBB7_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, 23, s10
-; GCN-IR-NEXT:    s_subb_u32 s4, 0, s11
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b32 s12, s12, 24
-; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, s11, 0
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshr_b32 s6, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, 23, s12
+; GCN-IR-NEXT:    s_subb_u32 s4, 0, s13
+; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 31
+; GCN-IR-NEXT:    s_and_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_and_b32 s4, s4, 24
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s4
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, 0
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
-; GCN-IR-NEXT:  .LBB7_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-IR-NEXT:  .LBB7_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, 24
-; GCN-IR-NEXT:    s_mov_b32 s8, s0
-; GCN-IR-NEXT:    s_mul_i32 s0, s7, 24
+; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT:    s_lshl_b64 s[4:5], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[4:5]
+; GCN-IR-NEXT:  .LBB7_4: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s8, 24
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s9, 24
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s6, 24
+; GCN-IR-NEXT:    s_mul_i32 s0, s8, 24
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
-; GCN-IR-NEXT:    s_mov_b32 s9, s1
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
   store i64 %result, ptr addrspace(1) %out
@@ -1154,8 +1123,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
@@ -1182,52 +1151,50 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:  .LBB8_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v2
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB8_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB8_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT:  .LBB8_6: ; %Flow5
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB8_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT:    v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT:  .LBB8_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1249,71 +1216,60 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 48, v6
-; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT:    v_min_u32_e32 v2, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], 48, v2
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[4:5]
-; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_cbranch_execz .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v7
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v6
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:  .LBB9_2: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v8
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s10, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v10
 ; GCN-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
 ; GCN-IR-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_3
-; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
-; GCN-IR-NEXT:  .LBB9_6: ; %Flow5
+; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_2
+; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v2, v4
+; GCN-IR-NEXT:  .LBB9_4: ; %Flow
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[4:5], 15
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
index 77d861ad0599c18..ee911e001b79def 100644
--- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -262,10 +262,8 @@ exit:
 ; for.latch
 ; for.check
 ; test1
-; test2
 ; test3
 ; test4
-; optional1
 ; optional2
 ; optional3
 ; optional4
@@ -282,9 +280,6 @@ exit:
 ;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
 ;CHECK-O3: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
 ;CHECK: # %bb.{{[0-9]+}}: # %test1
-;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: # %test2
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2
 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
 ;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
@@ -294,10 +289,7 @@ exit:
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 8
 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
 ;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK: [[OPT1LABEL]]
-;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2
-;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
-;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: .[[OPT2LABEL]]
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 4
 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
 ;CHECK-NEXT: .[[OPT3LABEL]]
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 1c95d28b5eed1be..70a619c37bf2517 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -997,14 +997,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r3, #8
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    blo.w .LBB16_12
+; CHECK-NEXT:    blo.w .LBB16_11
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
-; CHECK-NEXT:    lsrs.w r12, r3, #2
-; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r4, [r0]
 ; CHECK-NEXT:    movs r1, #1
-; CHECK-NEXT:    ldrd r5, r3, [r0, #4]
+; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
+; CHECK-NEXT:    lsr.w r9, r3, #2
 ; CHECK-NEXT:    sub.w r0, r4, #8
 ; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
 ; CHECK-NEXT:    and r0, r0, #7
@@ -1017,7 +1015,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    subs r1, r7, #2
 ; CHECK-NEXT:    rsbs r7, r4, #0
 ; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    add.w r7, r3, #16
+; CHECK-NEXT:    add.w r7, r12, #16
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
@@ -1035,7 +1033,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:  .LBB16_5: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    subs.w r9, r9, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
@@ -1045,15 +1043,15 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    ldrh.w lr, [r3, #14]
+; CHECK-NEXT:    ldrh.w lr, [r12, #14]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
-; CHECK-NEXT:    ldrh.w r8, [r3, #12]
-; CHECK-NEXT:    ldrh r7, [r3, #10]
-; CHECK-NEXT:    ldrh r4, [r3, #8]
-; CHECK-NEXT:    ldrh r6, [r3, #6]
-; CHECK-NEXT:    ldrh.w r9, [r3, #4]
-; CHECK-NEXT:    ldrh.w r11, [r3, #2]
-; CHECK-NEXT:    ldrh.w r10, [r3]
+; CHECK-NEXT:    ldrh.w r8, [r12, #12]
+; CHECK-NEXT:    ldrh.w r7, [r12, #10]
+; CHECK-NEXT:    ldrh.w r4, [r12, #8]
+; CHECK-NEXT:    ldrh.w r3, [r12, #6]
+; CHECK-NEXT:    ldrh.w r6, [r12, #4]
+; CHECK-NEXT:    ldrh.w r11, [r12, #2]
+; CHECK-NEXT:    ldrh.w r10, [r12]
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
@@ -1063,10 +1061,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    adds r0, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r11
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
-; CHECK-NEXT:    vfma.f16 q0, q1, r9
+; CHECK-NEXT:    vfma.f16 q0, q1, r6
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    add.w r0, r5, #10
-; CHECK-NEXT:    vfma.f16 q0, q1, r6
+; CHECK-NEXT:    vfma.f16 q0, q1, r3
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
@@ -1090,25 +1088,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
-; CHECK-NEXT:    adds r4, r5, #2
+; CHECK-NEXT:    adds r3, r5, #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
 ; CHECK-NEXT:    ldrh r0, [r6, #-14]
-; CHECK-NEXT:    adds r4, r5, #6
+; CHECK-NEXT:    adds r3, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #-12]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
 ; CHECK-NEXT:    ldrh r0, [r6, #-10]
-; CHECK-NEXT:    add.w r4, r5, #10
+; CHECK-NEXT:    add.w r3, r5, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #-8]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vldrw.u32 q1, [r3]
 ; CHECK-NEXT:    ldrh r0, [r6, #-6]
-; CHECK-NEXT:    ldrh r4, [r6, #-2]
+; CHECK-NEXT:    ldrh r3, [r6, #-2]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #-4]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
@@ -1128,7 +1126,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:  .LBB16_11: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r4, [r6], #2
+; CHECK-NEXT:    ldrh r3, [r6], #2
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    le lr, .LBB16_11
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 808626d9a0aebe6..332453360a752c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -983,12 +983,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-LABEL: fir:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    cmp r3, #8
-; CHECK-NEXT:    blo.w .LBB16_13
-; CHECK-NEXT:  @ %bb.1: @ %if.then
-; CHECK-NEXT:    lsrs.w r12, r3, #2
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    bxeq lr
-; CHECK-NEXT:  .LBB16_2: @ %while.body.lr.ph
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    bxlo lr
+; CHECK-NEXT:  .LBB16_1: @ %if.then
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    .pad #4
@@ -997,24 +994,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    ldrh r6, [r0]
-; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    ldrd r4, r10, [r0, #4]
-; CHECK-NEXT:    sub.w r0, r6, #8
-; CHECK-NEXT:    add.w r3, r0, r0, lsr #29
+; CHECK-NEXT:    ldrh r5, [r0]
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    ldrd r4, r12, [r0, #4]
+; CHECK-NEXT:    lsr.w r10, r3, #2
+; CHECK-NEXT:    sub.w r0, r5, #8
+; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
 ; CHECK-NEXT:    and r0, r0, #7
-; CHECK-NEXT:    asrs r7, r3, #3
-; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    asr.w lr, r7, #3
+; CHECK-NEXT:    cmp.w lr, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r5, r3, #3
-; CHECK-NEXT:    add.w r3, r4, r6, lsl #2
-; CHECK-NEXT:    sub.w r9, r3, #4
-; CHECK-NEXT:    rsbs r3, r6, #0
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r10, #32
-; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    asrgt r6, r7, #3
+; CHECK-NEXT:    add.w r7, r4, r5, lsl #2
+; CHECK-NEXT:    sub.w r9, r7, #4
+; CHECK-NEXT:    rsbs r7, r5, #0
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r7, r12, #32
+; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r5, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_6
 ; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
@@ -1031,7 +1029,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:  .LBB16_5: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r12, r12, #1
+; CHECK-NEXT:    subs.w r10, r10, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
 ; CHECK-NEXT:    add.w r4, r0, #16
@@ -1042,24 +1040,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    ldrd r3, r7, [r10]
-; CHECK-NEXT:    ldm.w lr, {r0, r5, r6, lr}
-; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
+; CHECK-NEXT:    ldrd r7, r6, [r12]
+; CHECK-NEXT:    ldrd r0, r5, [r12, #8]
+; CHECK-NEXT:    ldrd r3, lr, [r12, #16]
+; CHECK-NEXT:    ldrd r11, r8, [r12, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
 ; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT:    vmul.f32 q0, q0, r3
+; CHECK-NEXT:    vmul.f32 q0, q0, r7
 ; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
 ; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT:    vfma.f32 q0, q1, r7
+; CHECK-NEXT:    vfma.f32 q0, q1, r6
 ; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r0
 ; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r5
 ; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
-; CHECK-NEXT:    vfma.f32 q0, q5, r6
+; CHECK-NEXT:    vfma.f32 q0, q5, r3
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f32 q0, q2, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
@@ -1106,7 +1105,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
-; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    le lr, .LBB16_11
 ; CHECK-NEXT:    b .LBB16_3
@@ -1115,7 +1114,6 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:  .LBB16_13: @ %if.end
 ; CHECK-NEXT:    bx lr
 entry:
   %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1

>From 6a68fd9597152d5243133e32051fddeecc08d5af Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 22 Oct 2023 14:19:08 +0800
Subject: [PATCH 4/5] [ValueTracking] Add tests from PR69038. NFC.

---
 .../Analysis/ValueTracking/implied-icmp-binop.ll    | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
index 882c38f329bd884..b9cc3f9a1b1615c 100644
--- a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -191,3 +191,16 @@ entry:
   %and = and i1 %cmp, %cmp2
   ret i1 %and
 }
+
+define i1 @pr69038(i32 %a, i32 %b) {
+; CHECK-LABEL: define i1 @pr69038(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
+;
+  %tobool = icmp ne i32 %a, 0
+  %or = or i32 %a, %b
+  %tobool1 = icmp ne i32 %or, 0
+  %and = and i1 %tobool, %tobool1
+  ret i1 %and
+}
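
For context: the fold checked by @pr69038 is sound because every bit set in %a is also set in %a | %b, so %a != 0 implies (%a | %b) != 0 and the second compare is redundant under the `and`. A minimal C reproducer (hypothetical source, not part of this patch; the function name is made up) that lowers to the same flat IR pattern when built with optimizations:

  // Using & rather than && avoids short-circuit control flow, so the
  // frontend emits the plain icmp/or/icmp/and sequence tested above.
  int src_pr69038(int a, int b) {
    return (a != 0) & ((a | b) != 0); // expected to simplify to (a != 0)
  }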

>From 374533a08d764db6a37f8f70760f3d31f0acb65b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 4 Nov 2023 18:05:54 +0800
Subject: [PATCH 5/5] fixup! [ValueTracking] Improve `isImpliedCondICmps` to
 handle binops

Rebase to resolve conflicts
---
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            | 76 +++++++++----------
 llvm/test/CodeGen/AMDGPU/srem64.ll            | 42 +++++-----
 llvm/test/CodeGen/AMDGPU/udiv64.ll            | 50 +++++-------
 llvm/test/CodeGen/AMDGPU/urem64.ll            | 38 ++++------
 llvm/test/CodeGen/PowerPC/reduce_cr.ll        |  4 +-
 .../CodeGen/Thumb2/mve-float16regloops.ll     | 64 ++++++++--------
 .../CodeGen/Thumb2/mve-float32regloops.ll     | 63 ++++++++-------
 7 files changed, 152 insertions(+), 185 deletions(-)
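
The hunks below are regenerated check lines: the codegen differences appear to be knock-on effects of the stronger implication reasoning, e.g. in the Thumb2 `fir` tests an unsigned guard `x >= 8` now implies `x >> 2 != 0`, so a zero-trip-count check folds away and the surviving blocks and registers get renumbered. A hedged C sketch of that shift case (illustrative only; names are not from the patch):

  // For unsigned x, x >= 8 implies x >> 2 >= 2, so the second
  // early-return is provably dead and should be eliminated.
  unsigned blocks_of_four(unsigned x) {
    if (x < 8) return 0;
    unsigned blkCnt = x >> 2;
    if (blkCnt == 0) return 0; // implied false; expected to fold away
    return blkCnt;
  }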

diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 950e8c60ef9d01f..cdb1930658e4f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -371,12 +371,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v2, v3
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v10
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v11
 ; GCN-IR-NEXT:    v_min_u32_e32 v2, v2, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v10
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v11
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v3, v8
-; GCN-IR-NEXT:    v_sub_i32_e32 v13, vcc, v2, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v13, vcc, v12, v2
 ; GCN-IR-NEXT:    v_subb_u32_e64 v14, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[13:14]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
@@ -395,13 +395,13 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v13
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 63, v13
 ; GCN-IR-NEXT:    v_add_i32_e32 v17, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v18, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[10:11], v8
 ; GCN-IR-NEXT:    v_lshr_b64 v[13:14], v[10:11], v14
+; GCN-IR-NEXT:    v_addc_u32_e32 v18, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v10, v12
 ; GCN-IR-NEXT:    v_not_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v2, v12
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v10, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
 ; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
@@ -1539,44 +1539,36 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v0, v1
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v5, vcc, s6, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0x8000
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, s6, v8
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[4:5]
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB12_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 63, v6
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v4
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v9
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v5, vcc
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], s[4:5], v6
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 47, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
@@ -1601,18 +1593,18 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v13, v1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, v0
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_2
 ; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[6:7], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v0, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN-IR-NEXT:  .LBB12_4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 0191930aabba5a1..90ee3a3da39e0cc 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1725,34 +1725,26 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT:    s_movk_i32 s8, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s8, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v6
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB12_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v2
@@ -1780,18 +1772,18 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_2
 ; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-IR-NEXT:  .LBB12_4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 6fc0e90b7724460..eb996692e99ce07 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -315,12 +315,12 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v6, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v6, v8
+; GCN-IR-NEXT:    v_min_u32_e32 v6, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v8, v6
 ; GCN-IR-NEXT:    v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[9:10]
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
@@ -340,10 +340,10 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_lshr_b64 v[9:10], v[0:1], v10
 ; GCN-IR-NEXT:    v_addc_u32_e32 v14, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v0, v6
+; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
 ; GCN-IR-NEXT:    v_not_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:  .LBB1_2: ; %udiv-do-while
@@ -1132,35 +1132,27 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v3
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v0, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v2
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
@@ -1185,18 +1177,18 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v0
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_2
 ; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[1:2], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v0, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
 ; GCN-IR-NEXT:  .LBB9_4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 32768, %x
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index e91053384b3cec0..6264203099699ab 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1124,32 +1124,24 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0x8000
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB8_6
+; GCN-IR-NEXT:    s_cbranch_execz .LBB8_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT:    s_cbranch_execz .LBB8_5
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 63, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
 ; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v2
@@ -1177,18 +1169,18 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, v3
-; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v2
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB8_2
 ; GCN-IR-NEXT:  ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[3:4], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v4
 ; GCN-IR-NEXT:  .LBB8_4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GCN-IR-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, v2
diff --git a/llvm/test/CodeGen/PowerPC/reduce_cr.ll b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
index 7491d13c5301015..e5761de12670ff9 100644
--- a/llvm/test/CodeGen/PowerPC/reduce_cr.ll
+++ b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 ;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
 ;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
 ;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = {{.*}}
-;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = {{.*}}
+;CHECK-NEXT: - BB3[optional2]: float = 0.625, int = {{.*}}
 
 ;CHECK:      block-frequency-info: loop_test
 ;CHECK:      block-frequency-info: loop_test
@@ -19,7 +19,7 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 ;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
 ;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = {{.*}}
 ;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = {{.*}}
-;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = {{.*}}
+;CHECK-NEXT: - BB4[optional2]: float = 0.625, int = {{.*}}
 
 
 define void @loop_test(ptr %tags, i32 %count) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 70a619c37bf2517..84ef6b1a02750f7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1019,29 +1019,29 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_6
-; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_5
+; CHECK-NEXT:  .LBB16_2: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
-; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_4: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_3: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_5
-; CHECK-NEXT:    b .LBB16_10
-; CHECK-NEXT:  .LBB16_5: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    wls lr, r0, .LBB16_4
+; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_4: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r9, r9, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
-; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  .LBB16_6: @ %while.body
+; CHECK-NEXT:    beq.w .LBB16_11
+; CHECK-NEXT:  .LBB16_5: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldrh.w lr, [r12, #14]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -1077,14 +1077,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_9
-; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    blo .LBB16_8
+; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:  .LBB16_7: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
@@ -1114,24 +1114,24 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    add.w r0, r5, #14
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    adds r5, #16
-; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_8
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    vfma.f16 q0, q1, r3
+; CHECK-NEXT:    le lr, .LBB16_7
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:  .LBB16_11: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:  .LBB16_10: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r3, [r6], #2
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
-; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_11
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_12: @ %if.end
+; CHECK-NEXT:    vfma.f16 q0, q1, r3
+; CHECK-NEXT:    le lr, .LBB16_10
+; CHECK-NEXT:    b .LBB16_2
+; CHECK-NEXT:  .LBB16_11: @ %if.end
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 332453360a752c2..394a8ba8f53d09f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1014,31 +1014,30 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r5, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_6
-; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_5
+; CHECK-NEXT:  .LBB16_2: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
-; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_4: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_3: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_5
-; CHECK-NEXT:    b .LBB16_10
-; CHECK-NEXT:  .LBB16_5: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    wls lr, r0, .LBB16_4
+; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_4: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r10, r10, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
 ; CHECK-NEXT:    add.w r4, r0, #16
-; CHECK-NEXT:    beq .LBB16_12
-; CHECK-NEXT:  .LBB16_6: @ %while.body
+; CHECK-NEXT:    beq .LBB16_11
+; CHECK-NEXT:  .LBB16_5: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT:    add.w lr, r10, #8
+; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    ldrd r7, r6, [r12]
 ; CHECK-NEXT:    ldrd r0, r5, [r12, #8]
@@ -1065,14 +1064,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
-; CHECK-NEXT:    blo .LBB16_9
-; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    blo .LBB16_8
+; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:  .LBB16_7: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
@@ -1093,23 +1092,23 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q2, r11
 ; CHECK-NEXT:    vfma.f32 q0, q3, r9
 ; CHECK-NEXT:    vfma.f32 q0, q1, r1
-; CHECK-NEXT:    le lr, .LBB16_8
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_7
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:  .LBB16_11: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    mov r6, r4
+; CHECK-NEXT:  .LBB16_10: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    le lr, .LBB16_11
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_12:
+; CHECK-NEXT:    le lr, .LBB16_10
+; CHECK-NEXT:    b .LBB16_2
+; CHECK-NEXT:  .LBB16_11:
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4


