[llvm] [ValueTracking] Improve `isImpliedCondICmps` to handle binops (PR #69840)
Yingwei Zheng via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 4 03:06:27 PDT 2023
https://github.com/dtcxzyw updated https://github.com/llvm/llvm-project/pull/69840
From dd6ee62e1adf2c4400b27a12755c04a326f0268b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 21 Oct 2023 20:43:39 +0800
Subject: [PATCH 1/5] [ValueTracking] Add pre-commit tests from PR68799. NFC.
---
.../ValueTracking/implied-icmp-binop.ll | 223 ++++++++++++++++++
1 file changed, 223 insertions(+)
create mode 100644 llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
new file mode 100644
index 000000000000000..a85214346c5a08a
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Tests from PR68799
+
+define i1 @f_and(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_and(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[AND14:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: [[AND1115:%.*]] = and i1 [[CMP]], [[AND14]]
+; CHECK-NEXT: ret i1 [[AND1115]]
+;
+entry:
+ %cmp = icmp ne i32 %x, 0
+ %0 = or i32 %x, %y
+ %and14 = icmp eq i32 %0, 0
+ %and1115 = and i1 %cmp, %and14
+ ret i1 %and1115
+}
+
+define i1 @f_or(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_or(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR14:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: [[OR1115:%.*]] = or i1 [[CMP_NOT]], [[OR14]]
+; CHECK-NEXT: ret i1 [[OR1115]]
+;
+entry:
+ %cmp.not = icmp eq i32 %x, 0
+ %0 = or i32 %x, %y
+ %or14 = icmp ne i32 %0, 0
+ %or1115 = or i1 %cmp.not, %or14
+ ret i1 %or1115
+}
+
+; Tests for more binops
+
+define i1 @f_add(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 7
+ %cmp = icmp ult i32 %x, 8
+ %0 = add i32 %yr, %x
+ %cmp2 = icmp ugt i32 %0, 16
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+define i1 @f_add_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 2147483647
+ %cmp = icmp sgt i32 %x, 5
+ %0 = add nsw i32 %yr, %x
+ %cmp2 = icmp slt i32 %0, 5
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+define i1 @f_add_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw i32 [[X]], [[Y]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %cmp = icmp ugt i32 %x, 1
+ %0 = add nuw i32 %x, %y
+ %cmp2 = icmp eq i32 %0, 1
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+define i1 @f_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 [[X]], [[YR]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 2147483647
+ %cmp = icmp slt i32 %x, 5
+ %0 = sub nsw i32 %x, %yr
+ %cmp2 = icmp slt i32 %0, 5
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+define i1 @f_sub_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nuw(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = sub nuw i32 [[X]], [[Y]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 6
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %cmp = icmp ult i32 %x, 5
+ %0 = sub nuw i32 %x, %y
+ %cmp2 = icmp eq i32 %0, 6
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+; Negative tests
+
+; non-constant range
+define i1 @f_add_nofold1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], [[Z]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 7
+ %cmp = icmp ult i32 %x, %z
+ %0 = add i32 %yr, %x
+ %cmp2 = icmp ugt i32 %0, 16
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+define i1 @f_add_nofold2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define i1 @f_add_nofold2(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], [[Z]]
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 7
+ %cmp = icmp ult i32 %x, 8
+ %0 = add i32 %yr, %x
+ %cmp2 = icmp ugt i32 %0, %z
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+; narrower range
+define i1 @f_add_nofold3(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_add_nofold3(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 8
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 10
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 7
+ %cmp = icmp ult i32 %x, 8
+ %0 = add i32 %yr, %x
+ %cmp2 = icmp ugt i32 %0, 10
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
+
+; sub is not commutative
+define i1 @f_sub_nsw_nofold(i32 %x, i32 %y) {
+; CHECK-LABEL: define i1 @f_sub_nsw_nofold(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 2147483647
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 [[YR]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %yr = and i32 %y, 2147483647
+ %cmp = icmp slt i32 %x, 5
+ %0 = sub nsw i32 %yr, %x
+ %cmp2 = icmp slt i32 %0, 5
+ %and = and i1 %cmp, %cmp2
+ ret i1 %and
+}
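
For intuition before the fold lands: f_and asks for "x != 0" together with "(x | y) == 0", which is a contradiction, since x | y == 0 forces x == 0; f_or is the dual tautology. A standalone C++ sanity check, not part of the patch and shrunk to 8-bit values so the exhaustive loop stays cheap:

#include <cassert>

int main() {
  for (int x = 0; x < 256; ++x)
    for (int y = 0; y < 256; ++y) {
      // f_and: "x != 0 && (x | y) == 0" is unsatisfiable.
      assert(!(x != 0 && (x | y) == 0));
      // f_or: "x == 0 || (x | y) != 0" always holds.
      assert(x == 0 || (x | y) != 0);
    }
  return 0;
}
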
From 5bba0346e36c77663fa77fd06d0495e7e80fc069 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 21 Oct 2023 20:49:31 +0800
Subject: [PATCH 2/5] [ValueTracking] Improve `isImpliedCondICmps` to handle
binops
---
llvm/lib/Analysis/ValueTracking.cpp | 48 ++++++++++++++++---
.../ValueTracking/implied-icmp-binop.ll | 44 +++--------------
llvm/test/Transforms/InstCombine/icmp-or.ll | 18 ++-----
.../icmp-power2-and-icmp-shifted-mask.ll | 40 ++++------------
4 files changed, 60 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2458c1cb9f8ec1d..8e3577ea8a56482 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8253,14 +8253,11 @@ isImpliedCondMatchingOperands(CmpInst::Predicate LPred,
return std::nullopt;
}
-/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
-/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Return true if `X in DomCR` implies `X in CR` is true.
+/// Return false if `X in DomCR` implies `X in CR` is false.
/// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional<bool> isImpliedCondCommonOperandWithConstants(
- CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
- const APInt &RC) {
- ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
- ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+static std::optional<bool> isImpliedCondWithRange(const ConstantRange &DomCR,
+ const ConstantRange &CR) {
ConstantRange Intersection = DomCR.intersectWith(CR);
ConstantRange Difference = DomCR.difference(CR);
if (Intersection.isEmptySet())
@@ -8270,6 +8267,17 @@ static std::optional<bool> isImpliedCondCommonOperandWithConstants(
return std::nullopt;
}
+/// Return true if "icmp LPred X, LC" implies "icmp RPred X, RC" is true.
+/// Return false if "icmp LPred X, LC" implies "icmp RPred X, RC" is false.
+/// Otherwise, return std::nullopt if we can't infer anything.
+static std::optional<bool> isImpliedCondCommonOperandWithConstants(
+ CmpInst::Predicate LPred, const APInt &LC, CmpInst::Predicate RPred,
+ const APInt &RC) {
+ ConstantRange DomCR = ConstantRange::makeExactICmpRegion(LPred, LC);
+ ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, RC);
+ return isImpliedCondWithRange(DomCR, CR);
+}
+
/// Return true if LHS implies RHS (expanded to its components as "R0 RPred R1")
/// is true. Return false if LHS implies RHS is false. Otherwise, return
/// std::nullopt if we can't infer anything.
@@ -8320,6 +8328,32 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
return LPred == RPred;
}
+ // Handle R0 = L0 binop V.
+ Value *R0Op1 = nullptr;
+ if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)) &&
+ match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
+ ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
+ ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
+ // TODO: use contextual information from SimplifyQuery
+ ConstantRange RHSRange = computeConstantRange(
+ R0Op1, ICmpInst::isSigned(RPred), /*UseInstrInfo*/ true, /*AC*/ nullptr,
+ /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
+ auto BO = cast<BinaryOperator>(R0);
+ if (BO->getOperand(0) != L0)
+ std::swap(LHSRange, RHSRange);
+ unsigned NoWrapKind = 0;
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
+ if (OBO->hasNoUnsignedWrap())
+ NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
+ if (OBO->hasNoSignedWrap())
+ NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
+ }
+ ConstantRange Range =
+ LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
+ if (auto Res = isImpliedCondWithRange(Range, CR))
+ return Res;
+ }
+
if (LPred == RPred)
return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
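
The refactoring above splits the range reasoning out of isImpliedCondCommonOperandWithConstants so the new binop path can reuse it. A minimal sketch of the underlying idea, using a plain closed unsigned interval in place of llvm::ConstantRange (which additionally models wrapped ranges); the names below are illustrative, not the LLVM API:

#include <algorithm>
#include <cstdint>
#include <optional>

// Closed unsigned interval [Lo, Hi]; Lo > Hi means empty.
struct Interval {
  uint32_t Lo, Hi;
  bool isEmpty() const { return Lo > Hi; }
};

static Interval intersect(Interval A, Interval B) {
  return {std::max(A.Lo, B.Lo), std::min(A.Hi, B.Hi)};
}

// Mirrors isImpliedCondWithRange: disjoint ranges imply the second
// condition is false; containment (an empty difference) implies it is
// true; anything else is inconclusive.
static std::optional<bool> isImplied(Interval DomCR, Interval CR) {
  if (intersect(DomCR, CR).isEmpty())
    return false;
  if (DomCR.Lo >= CR.Lo && DomCR.Hi <= CR.Hi)
    return true;
  return std::nullopt;
}

int main() {
  // f_add from the tests: x in [0,7] plus (y & 7) in [0,7] gives a sum in
  // [0,14], disjoint from the "ugt 16" region [17, UINT32_MAX]: implied false.
  Interval Sum{0, 14}, Ugt16{17, UINT32_MAX};
  return isImplied(Sum, Ugt16) == std::optional<bool>(false) ? 0 : 1;
}

The real helper derives the same two answers from ConstantRange::intersectWith and ConstantRange::difference.
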
diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
index a85214346c5a08a..882c38f329bd884 100644
--- a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -7,11 +7,7 @@ define i1 @f_and(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_and(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT: [[AND14:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[AND1115:%.*]] = and i1 [[CMP]], [[AND14]]
-; CHECK-NEXT: ret i1 [[AND1115]]
+; CHECK-NEXT: ret i1 false
;
entry:
%cmp = icmp ne i32 %x, 0
@@ -25,11 +21,7 @@ define i1 @f_or(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_or(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT: [[OR14:%.*]] = icmp ne i32 [[TMP0]], 0
-; CHECK-NEXT: [[OR1115:%.*]] = or i1 [[CMP_NOT]], [[OR14]]
-; CHECK-NEXT: ret i1 [[OR1115]]
+; CHECK-NEXT: ret i1 true
;
entry:
%cmp.not = icmp eq i32 %x, 0
@@ -45,12 +37,7 @@ define i1 @f_add(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_add(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 7
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 8
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[YR]], [[X]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[TMP0]], 16
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: ret i1 false
;
entry:
%yr = and i32 %y, 7
@@ -65,12 +52,7 @@ define i1 @f_add_nsw(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_add_nsw(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 2147483647
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 5
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[YR]], [[X]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: ret i1 false
;
entry:
%yr = and i32 %y, 2147483647
@@ -85,11 +67,7 @@ define i1 @f_add_nuw(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_add_nuw(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = add nuw i32 [[X]], [[Y]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 1
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: ret i1 false
;
entry:
%cmp = icmp ugt i32 %x, 1
@@ -103,12 +81,8 @@ define i1 @f_sub_nsw(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_sub_nsw(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[YR:%.*]] = and i32 [[Y]], 2147483647
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X]], 5
-; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 [[X]], [[YR]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP0]], 5
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: ret i1 [[CMP]]
;
entry:
%yr = and i32 %y, 2147483647
@@ -123,11 +97,7 @@ define i1 @f_sub_nuw(i32 %x, i32 %y) {
; CHECK-LABEL: define i1 @f_sub_nuw(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 5
-; CHECK-NEXT: [[TMP0:%.*]] = sub nuw i32 [[X]], [[Y]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 6
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[CMP2]]
-; CHECK-NEXT: ret i1 [[AND]]
+; CHECK-NEXT: ret i1 false
;
entry:
%cmp = icmp ult i32 %x, 5
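
Why f_sub_nuw above can return false: nuw makes a wrapping subtraction poison, so in any well-defined execution y <= x, hence x - y <= x < 5 and the result can never equal 6. A standalone check under that reading of nuw (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 5; ++x)       // x ult 5
    for (uint32_t y = 0; y < 256; ++y) {
      if (y > x)
        continue;                        // would wrap: poison under nuw
      assert(x - y != 6);                // cmp2 can never be true
    }
  return 0;
}
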
diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll
index 922845c1e7e2d82..a96341f31132943 100644
--- a/llvm/test/Transforms/InstCombine/icmp-or.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-or.ll
@@ -430,13 +430,8 @@ define i1 @icmp_or_xor_2_ne_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
; CHECK-LABEL: @icmp_or_xor_2_3_fail(
-; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]]
-; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]]
-; CHECK-NEXT: [[OR:%.*]] = or i64 [[XOR]], [[XOR1]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[OR]], 0
-; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[XOR]], 0
-; CHECK-NEXT: [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]]
-; CHECK-NEXT: ret i1 [[OR1]]
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[X1:%.*]], [[Y1:%.*]]
+; CHECK-NEXT: ret i1 [[CMP_1]]
;
%xor = xor i64 %x1, %y1
%xor1 = xor i64 %x2, %y2
@@ -451,13 +446,8 @@ define i1 @icmp_or_xor_2_3_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
define i1 @icmp_or_xor_2_4_fail(i64 %x1, i64 %y1, i64 %x2, i64 %y2) {
; CHECK-LABEL: @icmp_or_xor_2_4_fail(
-; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[X1:%.*]], [[Y1:%.*]]
-; CHECK-NEXT: [[XOR1:%.*]] = xor i64 [[X2:%.*]], [[Y2:%.*]]
-; CHECK-NEXT: [[OR:%.*]] = or i64 [[XOR]], [[XOR1]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[OR]], 0
-; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[XOR1]], 0
-; CHECK-NEXT: [[OR1:%.*]] = or i1 [[CMP]], [[CMP_1]]
-; CHECK-NEXT: ret i1 [[OR1]]
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[X2:%.*]], [[Y2:%.*]]
+; CHECK-NEXT: ret i1 [[CMP_1]]
;
%xor = xor i64 %x1, %y1
%xor1 = xor i64 %x2, %y2
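
Both "fail" tests above still simplify because the dominating compare subsumes the other one: if ((x1 ^ y1) | (x2 ^ y2)) == 0, then each xor is individually zero, so OR-ing %cmp into %cmp_1 adds nothing. A standalone check over 4-bit values (not part of the patch; 4 bits keep the exhaustive loop at 65536 iterations):

#include <cassert>

int main() {
  for (int x1 = 0; x1 < 16; ++x1)
    for (int y1 = 0; y1 < 16; ++y1)
      for (int x2 = 0; x2 < 16; ++x2)
        for (int y2 = 0; y2 < 16; ++y2) {
          bool Cmp = ((x1 ^ y1) | (x2 ^ y2)) == 0;
          bool Cmp1 = (x1 ^ y1) == 0;
          assert((Cmp || Cmp1) == Cmp1); // Cmp implies Cmp1
        }
  return 0;
}

The _2_4 variant is symmetric, with Cmp1 taken as (x2 ^ y2) == 0.
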
diff --git a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
index 82fcca07a00ac66..27ecc5686066cf1 100644
--- a/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-power2-and-icmp-shifted-mask.ll
@@ -250,10 +250,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_256_239_gap_in_mask_fail(i3
define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 112
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 112
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 112
@@ -265,10 +262,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_112_mask_to_left_fail(i32 %x) {
define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 112
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 112
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 112
@@ -281,10 +275,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_112_mask_to_left_fail(i32
define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 56
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 56
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 56
@@ -296,10 +287,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_56_mask_overlap_fail(i32 %x) {
define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 56
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 56
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 56
@@ -312,10 +300,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_56_mask_overlap_fail(i32
define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 24
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 24
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 24
@@ -327,10 +312,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_24_mask_overlap_fail(i32 %x) {
define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 24
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 24
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 24
@@ -343,10 +325,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_24_mask_overlap_fail(i32
define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 12
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 12
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T1]], [[T3]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 12
@@ -358,10 +337,7 @@ define i1 @icmp_power2_and_icmp_shifted_mask_8_12_mask_overlap_fail(i32 %x) {
define i1 @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail(i32 %x) {
; CHECK-LABEL: @icmp_power2_and_icmp_shifted_mask_swapped_8_12_mask_overlap_fail(
; CHECK-NEXT: [[T1:%.*]] = icmp ult i32 [[X:%.*]], 8
-; CHECK-NEXT: [[T2:%.*]] = and i32 [[X]], 12
-; CHECK-NEXT: [[T3:%.*]] = icmp ne i32 [[T2]], 12
-; CHECK-NEXT: [[T4:%.*]] = and i1 [[T3]], [[T1]]
-; CHECK-NEXT: ret i1 [[T4]]
+; CHECK-NEXT: ret i1 [[T1]]
;
%t1 = icmp ult i32 %x, 8
%t2 = and i32 %x, 12
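
The shifted-mask folds above all share one shape: every mask (112, 56, 24, 12) has a bit at position 3 or higher, and any x ult 8 has none, so (x & mask) == mask is impossible and the "!= mask" compare is implied true by the range check alone. A standalone check (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Masks[] = {112, 56, 24, 12};
  for (uint32_t x = 0; x < 8; ++x)       // x ult 8
    for (uint32_t Mask : Masks)
      assert((x & Mask) != Mask);        // t3 is implied true
  return 0;
}
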
From 4d2df870f4bbdcedd512a4862bbe7873e7e58954 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 22 Oct 2023 03:45:15 +0800
Subject: [PATCH 3/5] fixup! [ValueTracking] Improve `isImpliedCondICmps` to
handle binops
---
llvm/lib/Analysis/ValueTracking.cpp | 58 +-
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 593 +++++++--------
llvm/test/CodeGen/AMDGPU/srem64.ll | 678 ++++++++----------
llvm/test/CodeGen/AMDGPU/udiv64.ll | 654 ++++++++---------
llvm/test/CodeGen/AMDGPU/urem64.ll | 534 +++++++-------
llvm/test/CodeGen/PowerPC/tail-dup-layout.ll | 10 +-
.../CodeGen/Thumb2/mve-float16regloops.ll | 48 +-
.../CodeGen/Thumb2/mve-float32regloops.ll | 60 +-
8 files changed, 1201 insertions(+), 1434 deletions(-)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8e3577ea8a56482..255298c01185450 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8302,8 +8302,36 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
// Can we infer anything when the 0-operands match and the 1-operands are
// constants (not necessarily matching)?
const APInt *LC, *RC;
- if (L0 == R0 && match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)))
- return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+ if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC))) {
+ if (L0 == R0)
+ return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
+
+ // Handle R0 = L0 binop V and R0 = V binop L0.
+ Value *R0Op1 = nullptr;
+ if (match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
+ ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
+ ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
+ // TODO: use contextual information from SimplifyQuery
+ ConstantRange RHSRange =
+ computeConstantRange(R0Op1, ICmpInst::isSigned(RPred),
+ /*UseInstrInfo*/ true, /*AC*/ nullptr,
+ /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
+ auto *BO = cast<BinaryOperator>(R0);
+ if (BO->getOperand(0) != L0)
+ std::swap(LHSRange, RHSRange);
+ unsigned NoWrapKind = 0;
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
+ if (OBO->hasNoUnsignedWrap())
+ NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
+ if (OBO->hasNoSignedWrap())
+ NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
+ }
+ ConstantRange Range =
+ LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
+ if (auto Res = isImpliedCondWithRange(Range, CR))
+ return Res;
+ }
+ }
// L0 = R0 = L1 + R1, L0 >=u L1 implies R0 >=u R1, L0 <u L1 implies R0 <u R1
if (ICmpInst::isUnsigned(LPred) && ICmpInst::isUnsigned(RPred)) {
@@ -8328,32 +8356,6 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
return LPred == RPred;
}
- // Handle R0 = L0 binop V.
- Value *R0Op1 = nullptr;
- if (match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)) &&
- match(R0, m_c_BinOp(m_Specific(L0), m_Value(R0Op1)))) {
- ConstantRange LHSRange = ConstantRange::makeExactICmpRegion(LPred, *LC);
- ConstantRange CR = ConstantRange::makeExactICmpRegion(RPred, *RC);
- // TODO: use contextual information from SimplifyQuery
- ConstantRange RHSRange = computeConstantRange(
- R0Op1, ICmpInst::isSigned(RPred), /*UseInstrInfo*/ true, /*AC*/ nullptr,
- /*CtxI*/ nullptr, /*DT*/ nullptr, Depth);
- auto BO = cast<BinaryOperator>(R0);
- if (BO->getOperand(0) != L0)
- std::swap(LHSRange, RHSRange);
- unsigned NoWrapKind = 0;
- if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
- if (OBO->hasNoUnsignedWrap())
- NoWrapKind |= OverflowingBinaryOperator::NoUnsignedWrap;
- if (OBO->hasNoSignedWrap())
- NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
- }
- ConstantRange Range =
- LHSRange.overflowingBinaryOp(BO->getOpcode(), RHSRange, NoWrapKind);
- if (auto Res = isImpliedCondWithRange(Range, CR))
- return Res;
- }
-
if (LPred == RPred)
return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
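
The fixup hoists the binop handling into the branch that has already matched both constants, so it now runs before the unsigned-add special case, and the comment gains the commuted form R0 = V binop L0. The NoWrapKind bits matter because they let ConstantRange::overflowingBinaryOp keep a tight result range where plain interval arithmetic would have to widen. A rough sketch of that effect for unsigned 8-bit addition (plain structs with hypothetical names, not the LLVM API):

#include <cstdint>

// Closed unsigned i8 interval [Lo, Hi]; Lo > Hi means empty.
struct Interval {
  uint32_t Lo, Hi;
};

static Interval addI8(Interval A, Interval B, bool HasNUW) {
  uint32_t Lo = A.Lo + B.Lo, Hi = A.Hi + B.Hi;
  if (Hi <= 0xff)        // no pair of operands can wrap
    return {Lo, Hi};
  if (HasNUW)            // wrapping is poison under nuw, so only the
    return {Lo, 0xff};   // non-wrapped part of the sum survives
  return {0, 0xff};      // may wrap: fall back to the full i8 range
}

int main() {
  Interval A{10, 20}, B{100, 250};
  Interval MayWrap = addI8(A, B, /*HasNUW=*/false); // widens to {0, 255}
  Interval NoWrap = addI8(A, B, /*HasNUW=*/true);   // stays at {110, 255}
  return (MayWrap.Lo == 0 && NoWrap.Lo == 110) ? 0 : 1;
}

An analogous signed rule for nsw is what lets the f_sub_nsw test collapse to its first compare.
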
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 9cb6842ae0a1827..950e8c60ef9d01f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -142,7 +142,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
-; GCN-IR-NEXT: s_mov_b32 s15, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31
; GCN-IR-NEXT: s_mov_b32 s1, s0
@@ -156,16 +155,16 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
+; GCN-IR-NEXT: s_flbit_i32_b32 s14, s13
; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9]
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s14, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT: s_min_u32 s18, s8, s9
-; GCN-IR-NEXT: s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT: s_min_u32 s8, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b32 s9, s12
+; GCN-IR-NEXT: s_add_i32 s9, s9, 32
+; GCN-IR-NEXT: s_min_u32 s18, s9, s14
+; GCN-IR-NEXT: s_sub_u32 s16, s8, s18
; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
@@ -174,27 +173,21 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13
; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12
; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23]
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s20, s16, 1
-; GCN-IR-NEXT: s_addc_u32 s21, s17, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
-; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16
+; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s17, s16, 1
+; GCN-IR-NEXT: s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s17
; GCN-IR-NEXT: s_add_u32 s19, s6, -1
; GCN-IR-NEXT: s_addc_u32 s20, s7, -1
-; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s18
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[12:13], s[8:9]
+; GCN-IR-NEXT: s_add_u32 s12, s12, s18
+; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1
; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
@@ -214,11 +207,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT: .LBB0_4: ; %Flow7
+; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT: .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB0_4: ; %udiv-end
; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1]
; GCN-IR-NEXT: s_sub_u32 s0, s2, s0
@@ -372,86 +365,75 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
-; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
-; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v12, v2, v3
-; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v10
-; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
-; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11
-; GCN-IR-NEXT: v_min_u32_e32 v13, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v12, v13
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3]
+; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
+; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
+; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3
+; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v10
+; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v11
+; GCN-IR-NEXT: v_min_u32_e32 v12, v3, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v13, vcc, v2, v12
+; GCN-IR-NEXT: v_subb_u32_e64 v14, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[13:14]
; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[13:14]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB1_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[10:11], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v9, v12
-; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[10:11], v14
-; GCN-IR-NEXT: v_not_b32_e32 v8, 0
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v13
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v13
+; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, -1, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v2, v2
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[10:11], v8
+; GCN-IR-NEXT: v_lshr_b64 v[13:14], v[10:11], v14
+; GCN-IR-NEXT: v_not_b32_e32 v11, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v2, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v14
-; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v15, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8
+; GCN-IR-NEXT: v_lshl_b64 v[13:14], v[13:14], 1
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v9
+; GCN-IR-NEXT: v_or_b32_e32 v12, v13, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v17, v12
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v18, v14, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v2
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10
-; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT: v_or_b32_e32 v8, v15, v8
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v13
+; GCN-IR-NEXT: v_and_b32_e32 v15, v13, v1
+; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v0
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v12
-; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v9
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v8
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
-; GCN-IR-NEXT: .LBB1_6: ; %Flow5
+; GCN-IR-NEXT: v_sub_i32_e64 v13, s[4:5], v12, v13
+; GCN-IR-NEXT: v_or_b32_e32 v9, v16, v9
+; GCN-IR-NEXT: v_subb_u32_e64 v14, s[4:5], v14, v15, s[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v16, v3
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_or_b32_e32 v8, v2, v8
+; GCN-IR-NEXT: .LBB1_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4
; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6
; GCN-IR-NEXT: v_xor_b32_e32 v3, v8, v0
@@ -971,7 +953,6 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-LABEL: s_test_sdiv24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT: s_mov_b32 s15, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
@@ -993,16 +974,16 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_subb_u32 s7, s7, s4
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
+; GCN-IR-NEXT: s_flbit_i32_b32 s14, s13
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s14, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT: s_min_u32 s18, s8, s9
-; GCN-IR-NEXT: s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT: s_min_u32 s8, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b32 s9, s12
+; GCN-IR-NEXT: s_add_i32 s9, s9, 32
+; GCN-IR-NEXT: s_min_u32 s18, s9, s14
+; GCN-IR-NEXT: s_sub_u32 s16, s8, s18
; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
@@ -1011,27 +992,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13
; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12
; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23]
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s20, s16, 1
-; GCN-IR-NEXT: s_addc_u32 s21, s17, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
-; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16
+; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s17, s16, 1
+; GCN-IR-NEXT: s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s17
; GCN-IR-NEXT: s_add_u32 s19, s6, -1
; GCN-IR-NEXT: s_addc_u32 s20, s7, -1
-; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s18
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[12:13], s[8:9]
+; GCN-IR-NEXT: s_add_u32 s12, s12, s18
+; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1
; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
@@ -1051,11 +1026,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0
; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT: .LBB9_4: ; %Flow4
+; GCN-IR-NEXT: s_cbranch_vccz .LBB9_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB9_4: ; %udiv-end
; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1]
@@ -1196,7 +1171,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_sdiv_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31
; GCN-IR-NEXT: s_mov_b32 s5, s4
@@ -1206,61 +1181,54 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2
; GCN-IR-NEXT: s_add_i32 s10, s10, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_add_u32 s12, s10, 0xffffffc5
-; GCN-IR-NEXT: s_addc_u32 s13, 0, -1
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15]
-; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec
-; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24
+; GCN-IR-NEXT: s_min_u32 s12, s10, s11
+; GCN-IR-NEXT: s_add_u32 s10, s12, 0xffffffc5
+; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[14:15]
+; GCN-IR-NEXT: s_and_b64 s[6:7], s[14:15], exec
+; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s15, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
-; GCN-IR-NEXT: s_sub_i32 s11, 63, s12
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s11
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s14
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s6, s10, 1
+; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s10
+; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s6
; GCN-IR-NEXT: s_add_u32 s16, s2, -1
; GCN-IR-NEXT: s_addc_u32 s17, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s10, 58, s10
-; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_sub_u32 s12, 58, s12
+; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s6, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
-; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-IR-NEXT: s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT: s_ashr_i32 s8, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s9, s8
+; GCN-IR-NEXT: s_and_b32 s6, s8, 1
+; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s14, s14, s8
+; GCN-IR-NEXT: s_subb_u32 s15, s15, s9
+; GCN-IR-NEXT: s_add_u32 s12, s12, 1
+; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
-; GCN-IR-NEXT: .LBB10_4: ; %Flow6
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
-; GCN-IR-NEXT: .LBB10_5: ; %udiv-end
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_vccz .LBB10_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT: .LBB10_4: ; %udiv-end
+; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s4, s6, s4
; GCN-IR-NEXT: s_subb_u32 s5, s7, s5
; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
@@ -1388,82 +1356,72 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
-; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5
+; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8
-; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s6, v8
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7]
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 24, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB11_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB11_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v9
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v6
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v4
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], 24, v6
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v0
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v7
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v0
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v14, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v15, v11, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v0
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v5
+; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v4
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v1
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v0
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB11_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0
-; GCN-IR-NEXT: .LBB11_6: ; %Flow5
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], 1
+; GCN-IR-NEXT: v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
+; GCN-IR-NEXT: .LBB11_4: ; %Flow
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1620,45 +1578,43 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v7
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v0
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v14, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v15, v11, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v0
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v1
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v0
+; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v5
+; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v4
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0
-; GCN-IR-NEXT: .LBB12_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v1
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v0
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2
-; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], 1
+; GCN-IR-NEXT: v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
+; GCN-IR-NEXT: .LBB12_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1683,81 +1639,70 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
-; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v1, v2, vcc
-; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v7
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
+; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v1, v2, vcc
+; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v6
; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0
-; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v8
+; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v7
; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v1
-; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 48, v0
-; GCN-IR-NEXT: v_subb_u32_e64 v4, s[4:5], 0, 0, s[4:5]
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[3:4]
-; GCN-IR-NEXT: v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], 48, v0
+; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB13_6
+; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB13_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v3
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[7:8], v3
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[7:8], v9
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 0xffffffcf, v0
+; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v8
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], v4
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v9
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT: .LBB13_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v4
-; GCN-IR-NEXT: v_or_b32_e32 v0, v9, v0
-; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, s12, v0
-; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v10, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v7
-; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9
-; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT: .LBB13_6: ; %Flow5
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s10, v8
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v0
+; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
+; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v1
+; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v0
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB13_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2
-; GCN-IR-NEXT: v_xor_b32_e32 v3, v6, v1
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v4, v0, v4
+; GCN-IR-NEXT: .LBB13_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v2
+; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %x, 32768
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 520ec6e24ae3bfe..0191930aabba5a1 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s14, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s14
-; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT: s_min_u32 s6, s10, s11
+; GCN-IR-NEXT: s_add_i32 s7, s7, 32
+; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT: s_min_u32 s10, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s14, s6, s10
+; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s11, s14, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s14
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s11
; GCN-IR-NEXT: s_add_u32 s16, s4, -1
; GCN-IR-NEXT: s_addc_u32 s17, s5, -1
-; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s10, s6, s14
-; GCN-IR-NEXT: s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[18:19], s[6:7]
+; GCN-IR-NEXT: s_add_u32 s10, s18, s10
+; GCN-IR-NEXT: s_addc_u32 s11, s19, 0
+; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s6, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s13, s12
+; GCN-IR-NEXT: s_and_b32 s6, s12, 1
+; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT: s_sub_u32 s14, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s15, s15, s13
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT: .LBB0_4: ; %Flow7
+; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB0_4: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-IR-NEXT: s_mov_b32 s12, s0
@@ -349,85 +342,74 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6
; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2
-; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3
; GCN-IR-NEXT: v_min_u32_e32 v10, v6, v7
; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0
-; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 32, v6
; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1
-; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
-; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT: v_min_u32_e32 v6, v6, v7
+; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v10, v6
+; GCN-IR-NEXT: v_subb_u32_e64 v12, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[11:12]
; GCN-IR-NEXT: v_mov_b32_e32 v5, v4
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[11:12]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB1_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v6
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v11
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v11
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v9, v10
+; GCN-IR-NEXT: v_not_b32_e32 v10, v10
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[0:1], v8
; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT: v_not_b32_e32 v8, 0
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v11
+; GCN-IR-NEXT: v_not_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v10, v6
; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7
-; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v12
-; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v13, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v9
+; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v13, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v8, v14, v8
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6
; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10
-; GCN-IR-NEXT: v_or_b32_e32 v7, v15, v7
-; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14
+; GCN-IR-NEXT: v_or_b32_e32 v9, v15, v9
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14
; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3
; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v14
; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v9
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v14, v8
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
-; GCN-IR-NEXT: .LBB1_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_or_b32_e32 v8, v6, v8
+; GCN-IR-NEXT: .LBB1_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_mul_lo_u32 v6, v2, v9
; GCN-IR-NEXT: v_mul_hi_u32 v7, v2, v8
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v8
@@ -1013,7 +995,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-IR-NEXT: s_mov_b32 s13, 0
+; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31
; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31
@@ -1029,69 +1011,62 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_subb_u32 s9, s7, s10
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
+; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3
; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8
; GCN-IR-NEXT: s_add_i32 s6, s6, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9
-; GCN-IR-NEXT: s_min_u32 s12, s6, s7
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s16, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT: s_min_u32 s6, s6, s7
+; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT: s_add_i32 s7, s7, 32
+; GCN-IR-NEXT: s_min_u32 s12, s7, s12
+; GCN-IR-NEXT: s_sub_u32 s16, s6, s12
+; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 63
; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19]
; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec
; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2
; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s18, s14, 1
-; GCN-IR-NEXT: s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s13, s16, 1
+; GCN-IR-NEXT: s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s13
; GCN-IR-NEXT: s_add_u32 s18, s8, -1
; GCN-IR-NEXT: s_addc_u32 s19, s9, -1
-; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13]
-; GCN-IR-NEXT: s_add_u32 s12, s6, s16
-; GCN-IR-NEXT: s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[20:21], s[6:7]
+; GCN-IR-NEXT: s_add_u32 s12, s20, s12
+; GCN-IR-NEXT: s_addc_u32 s13, s21, 0
+; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1
; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s6, s18, s14
-; GCN-IR-NEXT: s_subb_u32 s6, s19, s15
-; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s17, s16
-; GCN-IR-NEXT: s_and_b32 s6, s16, 1
-; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT: s_subb_u32 s15, s15, s17
+; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11]
+; GCN-IR-NEXT: s_sub_u32 s6, s18, s16
+; GCN-IR-NEXT: s_subb_u32 s6, s19, s17
+; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s15, s14
+; GCN-IR-NEXT: s_and_b32 s6, s14, 1
+; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s16, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s17, s17, s15
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
-; GCN-IR-NEXT: .LBB8_4: ; %Flow7
+; GCN-IR-NEXT: s_cbranch_vccz .LBB8_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GCN-IR-NEXT: .LBB8_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB8_4: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s10
; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0
; GCN-IR-NEXT: s_mul_i32 s11, s8, s11
@@ -1158,7 +1133,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-LABEL: s_test_srem24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT: s_mov_b32 s13, 0
+; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
@@ -1180,69 +1155,62 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_subb_u32 s7, s7, s10
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT: s_flbit_i32_b32 s12, s5
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s12, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5
-; GCN-IR-NEXT: s_min_u32 s16, s8, s9
-; GCN-IR-NEXT: s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT: s_min_u32 s8, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b32 s9, s4
+; GCN-IR-NEXT: s_add_i32 s9, s9, 32
+; GCN-IR-NEXT: s_min_u32 s12, s9, s12
+; GCN-IR-NEXT: s_sub_u32 s16, s8, s12
+; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 63
; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19]
; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec
; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5
; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4
; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s18, s14, 1
-; GCN-IR-NEXT: s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14
+; GCN-IR-NEXT: s_mov_b32 s9, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s18
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s13, s16, 1
+; GCN-IR-NEXT: s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
+; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[4:5], s13
; GCN-IR-NEXT: s_add_u32 s18, s6, -1
; GCN-IR-NEXT: s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s16
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[20:21], s[8:9]
+; GCN-IR-NEXT: s_add_u32 s12, s20, s12
+; GCN-IR-NEXT: s_addc_u32 s13, s21, 0
+; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1
; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT: s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT: s_mov_b32 s17, s16
-; GCN-IR-NEXT: s_and_b32 s8, s16, 1
-; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT: s_subb_u32 s15, s15, s17
+; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11]
+; GCN-IR-NEXT: s_sub_u32 s8, s18, s16
+; GCN-IR-NEXT: s_subb_u32 s8, s19, s17
+; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31
+; GCN-IR-NEXT: s_mov_b32 s15, s14
+; GCN-IR-NEXT: s_and_b32 s8, s14, 1
+; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s16, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s17, s17, s15
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT: .LBB9_4: ; %Flow4
+; GCN-IR-NEXT: s_cbranch_vccz .LBB9_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB9_4: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s10
; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
@@ -1386,76 +1354,69 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_srem_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31
-; GCN-IR-NEXT: s_mov_b32 s9, s8
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s4, s2, s8
-; GCN-IR-NEXT: s_subb_u32 s5, s3, s8
+; GCN-IR-NEXT: s_ashr_i32 s6, s3, 31
+; GCN-IR-NEXT: s_mov_b32 s7, s6
+; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GCN-IR-NEXT: s_sub_u32 s4, s2, s6
+; GCN-IR-NEXT: s_subb_u32 s5, s3, s6
; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4
; GCN-IR-NEXT: s_add_i32 s2, s2, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5
-; GCN-IR-NEXT: s_min_u32 s8, s2, s3
-; GCN-IR-NEXT: s_add_u32 s2, s8, 0xffffffc5
+; GCN-IR-NEXT: s_min_u32 s10, s2, s3
+; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[2:3], 63
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 63
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[10:11], s[12:13]
-; GCN-IR-NEXT: s_and_b64 s[10:11], s[12:13], exec
-; GCN-IR-NEXT: s_cselect_b32 s10, 0, 24
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
+; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec
+; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
+; GCN-IR-NEXT: s_mov_b32 s7, 0
+; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s10, s2, 1
-; GCN-IR-NEXT: s_addc_u32 s11, s3, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT: s_add_i32 s6, s2, 1
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
; GCN-IR-NEXT: s_lshl_b64 s[2:3], 24, s2
-; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s6
; GCN-IR-NEXT: s_add_u32 s14, s4, -1
; GCN-IR-NEXT: s_addc_u32 s15, s5, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
; GCN-IR-NEXT: s_lshr_b32 s6, s3, 31
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s6, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s6, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s6, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s6, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s6, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s8, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s9, s8
+; GCN-IR-NEXT: s_and_b32 s6, s8, 1
+; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], s[4:5]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s8
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s9
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3
-; GCN-IR-NEXT: .LBB10_4: ; %Flow6
+; GCN-IR-NEXT: s_cbranch_vccz .LBB10_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[2:3]
-; GCN-IR-NEXT: .LBB10_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT: .LBB10_4: ; %udiv-end
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0
-; GCN-IR-NEXT: s_mul_i32 s6, s4, s11
-; GCN-IR-NEXT: s_mul_i32 s5, s5, s10
-; GCN-IR-NEXT: s_mul_i32 s4, s4, s10
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT: s_mul_i32 s7, s4, s7
+; GCN-IR-NEXT: s_mul_i32 s5, s5, s6
+; GCN-IR-NEXT: s_mul_i32 s4, s4, s6
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s5, v0
; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s4
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
@@ -1584,78 +1545,68 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 24, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB11_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB11_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v7
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v2
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB11_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT: .LBB11_6: ; %Flow5
+; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT: .LBB11_4: ; %Flow
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1774,9 +1725,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: s_movk_i32 s8, 0xffd0
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s8, v6
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
@@ -1803,52 +1754,50 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT: .LBB12_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT: .LBB12_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1877,75 +1826,64 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v0
-; GCN-IR-NEXT: v_add_i32_e64 v3, s[4:5], 32, v3
-; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v1
-; GCN-IR-NEXT: v_min_u32_e32 v8, v3, v4
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v8
-; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
+; GCN-IR-NEXT: v_add_i32_e64 v4, s[4:5], 32, v4
+; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
+; GCN-IR-NEXT: v_min_u32_e32 v4, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], 48, v4
+; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[8:9]
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB13_6
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB13_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v8
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v8
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6
; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v4
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT: .LBB13_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, s12, v10
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v10
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v12
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
; GCN-IR-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: .LBB13_6: ; %Flow5
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB13_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_or_b32_e32 v6, v4, v6
+; GCN-IR-NEXT: .LBB13_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], 15
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e809292aad1d38b..6fc0e90b7724460 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -129,15 +129,15 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s14, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT: s_min_u32 s6, s10, s11
+; GCN-IR-NEXT: s_add_i32 s7, s7, 32
+; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT: s_min_u32 s14, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s12, s6, s14
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
@@ -146,28 +146,21 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s13, s12, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s12
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s13
; GCN-IR-NEXT: s_add_u32 s15, s4, -1
; GCN-IR-NEXT: s_addc_u32 s16, s5, -1
-; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11]
+; GCN-IR-NEXT: s_not_b64 s[2:3], s[6:7]
; GCN-IR-NEXT: s_add_u32 s2, s2, s14
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
@@ -187,11 +180,11 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT: .LBB0_4: ; %Flow7
+; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
-; GCN-IR-NEXT: .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB0_4: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
@@ -316,86 +309,75 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-LABEL: v_test_udiv_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v8, v9
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7]
-; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5]
-; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v6, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB1_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v9
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v9
+; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, -1, v2
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v0, v8
+; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[0:1], v10
+; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v0, v6
; GCN-IR-NEXT: v_not_b32_e32 v1, 0
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[9:10], v[9:10], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_or_b32_e32 v8, v9, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v13, v8
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v14, v10, vcc
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v6
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v3
-; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_or_b32_e32 v4, v11, v4
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v9
+; GCN-IR-NEXT: v_and_b32_e32 v11, v9, v3
+; GCN-IR-NEXT: v_and_b32_e32 v9, v9, v2
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v8
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0
-; GCN-IR-NEXT: .LBB1_6: ; %Flow5
+; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v8, v9
+; GCN-IR-NEXT: v_or_b32_e32 v5, v12, v5
+; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v11, s[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v7
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mov_b32_e32 v0, v5
-; GCN-IR-NEXT: v_mov_b32_e32 v1, v4
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4
+; GCN-IR-NEXT: .LBB1_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 %x, %y
ret i64 %result
@@ -784,7 +766,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-LABEL: s_test_udiv24_i48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff
; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000
@@ -796,16 +777,16 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT: s_flbit_i32_b32 s10, s9
; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2
; GCN-IR-NEXT: s_add_i32 s4, s4, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT: s_min_u32 s10, s4, s5
-; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8
-; GCN-IR-NEXT: s_add_i32 s4, s4, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9
-; GCN-IR-NEXT: s_min_u32 s14, s4, s5
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT: s_min_u32 s4, s4, s5
+; GCN-IR-NEXT: s_flbit_i32_b32 s5, s8
+; GCN-IR-NEXT: s_add_i32 s5, s5, 32
+; GCN-IR-NEXT: s_min_u32 s14, s5, s10
+; GCN-IR-NEXT: s_sub_u32 s12, s4, s14
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
@@ -814,27 +795,21 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9
; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12
+; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s16
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s13, s12, 1
+; GCN-IR-NEXT: s_sub_i32 s6, 63, s12
+; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s13
; GCN-IR-NEXT: s_add_u32 s15, s2, -1
; GCN-IR-NEXT: s_addc_u32 s16, s3, -1
-; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s8, s4, s14
-; GCN-IR-NEXT: s_addc_u32 s9, s5, 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_add_u32 s8, s8, s14
+; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
+; GCN-IR-NEXT: .LBB7_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
@@ -854,11 +829,11 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
-; GCN-IR-NEXT: .LBB7_4: ; %Flow4
+; GCN-IR-NEXT: s_cbranch_vccz .LBB7_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT: .LBB7_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB7_4: ; %udiv-end
; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
@@ -984,69 +959,62 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_udiv_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_min_u32 s8, s8, s9
-; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5
-; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
-; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec
-; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT: s_min_u32 s10, s8, s9
+; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5
+; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[12:13], exec
+; GCN-IR-NEXT: s_cselect_b32 s4, 0, 24
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT: s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9
+; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s4, s8, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s4
; GCN-IR-NEXT: s_add_u32 s14, s2, -1
; GCN-IR-NEXT: s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s6, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s7, s6
+; GCN-IR-NEXT: s_and_b32 s4, s6, 1
+; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s6
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s7
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3
-; GCN-IR-NEXT: .LBB8_4: ; %Flow6
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT: .LBB8_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT: s_cbranch_vccz .LBB8_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[2:3]
+; GCN-IR-NEXT: .LBB8_4: ; %udiv-end
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
-; GCN-IR-NEXT: v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT: v_mov_b32_e32 v1, s5
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
%result = udiv i64 24, %x
@@ -1157,13 +1125,15 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
-; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v2, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
+; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v3
+; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000
@@ -1192,45 +1162,41 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v12, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v0
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
+; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT: .LBB9_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v1
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v0
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB9_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mov_b32_e32 v0, v3
+; GCN-IR-NEXT: v_lshl_b64 v[1:2], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v1
; GCN-IR-NEXT: v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT: .LBB9_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 32768, %x
ret i64 %result
@@ -1250,73 +1216,62 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 48, v6
-; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v4
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
-; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB10_6
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB10_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v6
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB10_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v6
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v7
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT: .LBB10_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v6
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7
-; GCN-IR-NEXT: v_and_b32_e32 v7, 0x8000, v7
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3
-; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2
-; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB10_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT: .LBB10_6: ; %Flow5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB10_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mov_b32_e32 v0, v3
-; GCN-IR-NEXT: v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN-IR-NEXT: .LBB10_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v1, v3
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 %x, 32768
ret i64 %result
@@ -1405,66 +1360,60 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_udiv_k_den_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_min_u32 s10, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s8, 59, s10
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3
-; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s12, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8
+; GCN-IR-NEXT: s_min_u32 s6, s6, s7
+; GCN-IR-NEXT: s_sub_u32 s10, 59, s6
+; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13]
+; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec
+; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s12
-; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc4
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s11, s10, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s10
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s11
+; GCN-IR-NEXT: s_add_u32 s2, s6, 0xffffffc4
; GCN-IR-NEXT: s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
+; GCN-IR-NEXT: .LBB11_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, 23, s8
-; GCN-IR-NEXT: s_subb_u32 s4, 0, s9
-; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
-; GCN-IR-NEXT: s_and_b32 s4, s10, 1
-; GCN-IR-NEXT: s_and_b32 s10, s10, 24
-; GCN-IR-NEXT: s_sub_u32 s8, s8, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s9, 0
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, 23, s10
+; GCN-IR-NEXT: s_subb_u32 s4, 0, s11
+; GCN-IR-NEXT: s_ashr_i32 s4, s4, 31
+; GCN-IR-NEXT: s_and_b32 s6, s4, 1
+; GCN-IR-NEXT: s_and_b32 s4, s4, 24
+; GCN-IR-NEXT: s_sub_u32 s10, s10, s4
+; GCN-IR-NEXT: s_subb_u32 s11, s11, 0
; GCN-IR-NEXT: s_add_u32 s2, s2, 1
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT: s_mov_b64 s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3
-; GCN-IR-NEXT: .LBB11_4: ; %Flow6
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
-; GCN-IR-NEXT: .LBB11_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT: s_cbranch_vccz .LBB11_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3]
+; GCN-IR-NEXT: .LBB11_4: ; %udiv-end
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
-; GCN-IR-NEXT: v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT: v_mov_b32_e32 v1, s9
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
%result = udiv i64 %x, 24
@@ -1551,72 +1500,61 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 59, v6
-; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_min_u32_e32 v4, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 59, v4
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
-; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
+; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB12_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 63, v6
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6
+; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v7
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
+; GCN-IR-NEXT: .LBB12_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4
+; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7
-; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3
-; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2
-; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
-; GCN-IR-NEXT: .LBB12_6: ; %Flow5
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mov_b32_e32 v0, v3
-; GCN-IR-NEXT: v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN-IR-NEXT: .LBB12_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_mov_b32_e32 v0, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v1, v3
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 %x, 24
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 9c316612528c208..e91053384b3cec0 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -124,73 +124,66 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s14, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s12, s10, s14
-; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT: s_flbit_i32_b32 s7, s2
+; GCN-IR-NEXT: s_min_u32 s6, s10, s11
+; GCN-IR-NEXT: s_add_i32 s7, s7, 32
+; GCN-IR-NEXT: s_flbit_i32_b32 s10, s3
+; GCN-IR-NEXT: s_min_u32 s10, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s14, s6, s10
+; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17]
; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s11, s14, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s14
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s11
; GCN-IR-NEXT: s_add_u32 s16, s4, -1
; GCN-IR-NEXT: s_addc_u32 s17, s5, -1
-; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s10, s6, s14
-; GCN-IR-NEXT: s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_not_b64 s[18:19], s[6:7]
+; GCN-IR-NEXT: s_add_u32 s10, s18, s10
+; GCN-IR-NEXT: s_addc_u32 s11, s19, 0
+; GCN-IR-NEXT: .LBB0_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s6, s16, s12
-; GCN-IR-NEXT: s_subb_u32 s6, s17, s13
-; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s6, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT: s_sub_u32 s12, s12, s14
-; GCN-IR-NEXT: s_subb_u32 s13, s13, s15
+; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT: s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31
+; GCN-IR-NEXT: s_mov_b32 s13, s12
+; GCN-IR-NEXT: s_and_b32 s6, s12, 1
+; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT: s_sub_u32 s14, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s15, s15, s13
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3
-; GCN-IR-NEXT: .LBB0_4: ; %Flow7
+; GCN-IR-NEXT: s_cbranch_vccz .LBB0_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT: .LBB0_4: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-IR-NEXT: s_mov_b32 s12, s0
@@ -325,84 +318,73 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-LABEL: v_test_urem_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
-; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v8, v9
-; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5]
-; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT: v_min_u32_e32 v4, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v8, v4
+; GCN-IR-NEXT: v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[9:10]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB1_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v9
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v9
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v7, v8
+; GCN-IR-NEXT: v_not_b32_e32 v8, v8
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6
; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10
-; GCN-IR-NEXT: v_not_b32_e32 v6, 0
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v9
+; GCN-IR-NEXT: v_not_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v8, v4
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v6, v12, v6
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT: v_or_b32_e32 v7, v13, v7
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: .LBB1_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB1_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_or_b32_e32 v6, v4, v6
+; GCN-IR-NEXT: .LBB1_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7
; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6
@@ -812,74 +794,67 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_urem_k_num_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_min_u32 s8, s8, s9
-; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5
-; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13]
-; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec
-; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT: s_min_u32 s10, s8, s9
+; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5
+; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13]
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[12:13], exec
+; GCN-IR-NEXT: s_cselect_b32 s4, 0, 24
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT: s_mov_b32 s7, 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT: s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9
+; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s4, s8, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s4
; GCN-IR-NEXT: s_add_u32 s14, s2, -1
; GCN-IR-NEXT: s_addc_u32 s15, s3, -1
-; GCN-IR-NEXT: s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: .LBB6_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_sub_u32 s10, 58, s10
+; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT: .LBB6_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, s14, s10
-; GCN-IR-NEXT: s_subb_u32 s4, s15, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, s13
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT: s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT: s_ashr_i32 s6, s4, 31
+; GCN-IR-NEXT: s_mov_b32 s7, s6
+; GCN-IR-NEXT: s_and_b32 s4, s6, 1
+; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], s[2:3]
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s6
+; GCN-IR-NEXT: s_subb_u32 s13, s13, s7
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 0
+; GCN-IR-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3
-; GCN-IR-NEXT: .LBB6_4: ; %Flow6
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-IR-NEXT: .LBB6_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT: s_cbranch_vccz .LBB6_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT: .LBB6_4: ; %udiv-end
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-IR-NEXT: s_mov_b32 s8, s0
-; GCN-IR-NEXT: s_mul_i32 s0, s2, s7
+; GCN-IR-NEXT: s_mul_i32 s0, s2, s5
; GCN-IR-NEXT: s_mov_b32 s11, 0xf000
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT: s_mul_i32 s0, s3, s6
+; GCN-IR-NEXT: s_mul_i32 s0, s3, s4
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0
-; GCN-IR-NEXT: s_mul_i32 s0, s2, s6
+; GCN-IR-NEXT: s_mul_i32 s0, s2, s4
; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s0
; GCN-IR-NEXT: s_mov_b32 s10, -1
; GCN-IR-NEXT: s_mov_b32 s9, s1
@@ -972,75 +947,69 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-LABEL: s_test_urem_k_den_i64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_min_u32 s8, s6, s7
-; GCN-IR-NEXT: s_sub_u32 s10, 59, s8
+; GCN-IR-NEXT: s_min_u32 s6, s6, s7
+; GCN-IR-NEXT: s_sub_u32 s10, 59, s6
; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[10:11], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 63
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3
-; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
-; GCN-IR-NEXT: s_sub_i32 s9, 63, s10
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s9
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13]
+; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec
+; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT: s_mov_b32 s7, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s12
-; GCN-IR-NEXT: s_add_u32 s8, s8, 0xffffffc4
-; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
-; GCN-IR-NEXT: s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT: s_mov_b32 s5, 0
-; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while
+; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT: s_add_i32 s11, s10, 1
+; GCN-IR-NEXT: s_sub_i32 s8, 63, s10
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s11
+; GCN-IR-NEXT: s_add_u32 s10, s6, 0xffffffc4
+; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT: .LBB7_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s4, 23, s10
-; GCN-IR-NEXT: s_subb_u32 s4, 0, s11
-; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT: s_and_b32 s4, s12, 1
-; GCN-IR-NEXT: s_and_b32 s12, s12, 24
-; GCN-IR-NEXT: s_sub_u32 s10, s10, s12
-; GCN-IR-NEXT: s_subb_u32 s11, s11, 0
-; GCN-IR-NEXT: s_add_u32 s8, s8, 1
-; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31
+; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s4, 23, s12
+; GCN-IR-NEXT: s_subb_u32 s4, 0, s13
+; GCN-IR-NEXT: s_ashr_i32 s4, s4, 31
+; GCN-IR-NEXT: s_and_b32 s6, s4, 1
+; GCN-IR-NEXT: s_and_b32 s4, s4, 24
+; GCN-IR-NEXT: s_sub_u32 s12, s12, s4
+; GCN-IR-NEXT: s_subb_u32 s13, s13, 0
+; GCN-IR-NEXT: s_add_u32 s10, s10, 1
+; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
+; GCN-IR-NEXT: s_mov_b64 s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
-; GCN-IR-NEXT: .LBB7_4: ; %Flow6
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-IR-NEXT: .LBB7_5: ; %udiv-end
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, 24
-; GCN-IR-NEXT: s_mov_b32 s8, s0
-; GCN-IR-NEXT: s_mul_i32 s0, s7, 24
+; GCN-IR-NEXT: s_cbranch_vccz .LBB7_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[8:9], 1
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5]
+; GCN-IR-NEXT: .LBB7_4: ; %udiv-end
+; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, 24
+; GCN-IR-NEXT: s_mov_b32 s4, s0
+; GCN-IR-NEXT: s_mul_i32 s0, s9, 24
; GCN-IR-NEXT: v_mov_b32_e32 v2, s3
; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0
-; GCN-IR-NEXT: s_mul_i32 s0, s6, 24
+; GCN-IR-NEXT: s_mul_i32 s0, s8, 24
; GCN-IR-NEXT: v_mov_b32_e32 v0, s0
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT: s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s10, -1
-; GCN-IR-NEXT: s_mov_b32 s9, s1
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
+; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%result = urem i64 %x, 24
store i64 %result, ptr addrspace(1) %out
@@ -1154,8 +1123,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
@@ -1182,52 +1151,50 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: .LBB8_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v12, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v13, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB8_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7
-; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6
-; GCN-IR-NEXT: .LBB8_6: ; %Flow5
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB8_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2
-; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v2, v3
+; GCN-IR-NEXT: v_mov_b32_e32 v3, v4
+; GCN-IR-NEXT: .LBB8_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3
+; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2
+; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1249,71 +1216,60 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v6
-; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v2
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7]
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], -1
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5]
; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5]
-; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
-; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB9_6
+; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], vcc
+; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT: s_cbranch_execz .LBB9_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v6
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v2
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT: .LBB9_2: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v8
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v2
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v10
; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
-; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
-; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
-; GCN-IR-NEXT: .LBB9_6: ; %Flow5
+; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_cbranch_execnz .LBB9_2
+; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_or_b32_e32 v4, v2, v4
+; GCN-IR-NEXT: .LBB9_4: ; %Flow
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
index 77d861ad0599c18..ee911e001b79def 100644
--- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -262,10 +262,8 @@ exit:
; for.latch
; for.check
; test1
-; test2
; test3
; test4
-; optional1
; optional2
; optional3
; optional4
@@ -282,9 +280,6 @@ exit:
;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
;CHECK-O3: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
;CHECK: # %bb.{{[0-9]+}}: # %test1
-;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: # %test2
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2
;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
@@ -294,10 +289,7 @@ exit:
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 8
;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK: [[OPT1LABEL]]
-;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 2
-;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
-;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: .[[OPT2LABEL]]
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 4
;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
;CHECK-NEXT: .[[OPT3LABEL]]
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 1c95d28b5eed1be..70a619c37bf2517 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -997,14 +997,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: cmp r3, #8
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: blo.w .LBB16_12
+; CHECK-NEXT: blo.w .LBB16_11
; CHECK-NEXT: @ %bb.1: @ %if.then
-; CHECK-NEXT: lsrs.w r12, r3, #2
-; CHECK-NEXT: beq.w .LBB16_12
-; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: ldrh r4, [r0]
; CHECK-NEXT: movs r1, #1
-; CHECK-NEXT: ldrd r5, r3, [r0, #4]
+; CHECK-NEXT: ldrd r5, r12, [r0, #4]
+; CHECK-NEXT: lsr.w r9, r3, #2
; CHECK-NEXT: sub.w r0, r4, #8
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
@@ -1017,7 +1015,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r7, r3, #16
+; CHECK-NEXT: add.w r7, r12, #16
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
@@ -1035,7 +1033,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: subs.w r12, r12, #1
+; CHECK-NEXT: subs.w r9, r9, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
@@ -1045,15 +1043,15 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: ldrh.w lr, [r3, #14]
+; CHECK-NEXT: ldrh.w lr, [r12, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
-; CHECK-NEXT: ldrh.w r8, [r3, #12]
-; CHECK-NEXT: ldrh r7, [r3, #10]
-; CHECK-NEXT: ldrh r4, [r3, #8]
-; CHECK-NEXT: ldrh r6, [r3, #6]
-; CHECK-NEXT: ldrh.w r9, [r3, #4]
-; CHECK-NEXT: ldrh.w r11, [r3, #2]
-; CHECK-NEXT: ldrh.w r10, [r3]
+; CHECK-NEXT: ldrh.w r8, [r12, #12]
+; CHECK-NEXT: ldrh.w r7, [r12, #10]
+; CHECK-NEXT: ldrh.w r4, [r12, #8]
+; CHECK-NEXT: ldrh.w r3, [r12, #6]
+; CHECK-NEXT: ldrh.w r6, [r12, #4]
+; CHECK-NEXT: ldrh.w r11, [r12, #2]
+; CHECK-NEXT: ldrh.w r10, [r12]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
@@ -1063,10 +1061,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
-; CHECK-NEXT: vfma.f16 q0, q1, r9
+; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #10
-; CHECK-NEXT: vfma.f16 q0, q1, r6
+; CHECK-NEXT: vfma.f16 q0, q1, r3
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r0]
@@ -1090,25 +1088,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: adds r4, r5, #2
+; CHECK-NEXT: adds r3, r5, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: ldrh r0, [r6, #-14]
-; CHECK-NEXT: adds r4, r5, #6
+; CHECK-NEXT: adds r3, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-12]
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: ldrh r0, [r6, #-10]
-; CHECK-NEXT: add.w r4, r5, #10
+; CHECK-NEXT: add.w r3, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-8]
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: ldrh r0, [r6, #-6]
-; CHECK-NEXT: ldrh r4, [r6, #-2]
+; CHECK-NEXT: ldrh r3, [r6, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-4]
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
@@ -1128,7 +1126,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r6], #2
+; CHECK-NEXT: ldrh r3, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_11
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 808626d9a0aebe6..332453360a752c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -983,12 +983,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-LABEL: fir:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #8
-; CHECK-NEXT: blo.w .LBB16_13
-; CHECK-NEXT: @ %bb.1: @ %if.then
-; CHECK-NEXT: lsrs.w r12, r3, #2
-; CHECK-NEXT: it eq
-; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: .LBB16_2: @ %while.body.lr.ph
+; CHECK-NEXT: it lo
+; CHECK-NEXT: bxlo lr
+; CHECK-NEXT: .LBB16_1: @ %if.then
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
@@ -997,24 +994,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
-; CHECK-NEXT: sub.w r0, r6, #8
-; CHECK-NEXT: add.w r3, r0, r0, lsr #29
+; CHECK-NEXT: ldrh r5, [r0]
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: ldrd r4, r12, [r0, #4]
+; CHECK-NEXT: lsr.w r10, r3, #2
+; CHECK-NEXT: sub.w r0, r5, #8
+; CHECK-NEXT: add.w r7, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r7, r3, #3
-; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: asr.w lr, r7, #3
+; CHECK-NEXT: cmp.w lr, #1
; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
-; CHECK-NEXT: sub.w r9, r3, #4
-; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: asrgt r6, r7, #3
+; CHECK-NEXT: add.w r7, r4, r5, lsl #2
+; CHECK-NEXT: sub.w r9, r7, #4
+; CHECK-NEXT: rsbs r7, r5, #0
+; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r7, r12, #32
+; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
@@ -1031,7 +1029,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: subs.w r12, r12, #1
+; CHECK-NEXT: subs.w r10, r10, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
; CHECK-NEXT: add.w r4, r0, #16
@@ -1042,24 +1040,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
-; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
-; CHECK-NEXT: ldrd r11, r8, [r10, #24]
+; CHECK-NEXT: ldrd r7, r6, [r12]
+; CHECK-NEXT: ldrd r0, r5, [r12, #8]
+; CHECK-NEXT: ldrd r3, lr, [r12, #16]
+; CHECK-NEXT: ldrd r11, r8, [r12, #24]
; CHECK-NEXT: vstrb.8 q0, [r9], #16
; CHECK-NEXT: vldrw.u32 q0, [r4], #32
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT: vmul.f32 q0, q0, r3
+; CHECK-NEXT: vmul.f32 q0, q0, r7
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r7
+; CHECK-NEXT: vfma.f32 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
; CHECK-NEXT: vfma.f32 q0, q6, r0
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
-; CHECK-NEXT: vfma.f32 q0, q5, r6
+; CHECK-NEXT: vfma.f32 q0, q5, r3
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q2, lr
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
@@ -1106,7 +1105,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
-; CHECK-NEXT: vldrw.u32 q1, [r3], #4
+; CHECK-NEXT: vldrw.u32 q1, [r6], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
@@ -1115,7 +1114,6 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .LBB16_13: @ %if.end
; CHECK-NEXT: bx lr
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
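
A note on the Thumb2 churn above: the separate `lsrs.w r12, r3, #2` plus `beq` (or `it eq` / `bxeq lr`) early exit disappears because, once the `cmp r3, #8` guard has established that the block size is at least 8, the shifted count `r3 >> 2` is known non-zero and the second exit is dead. In IR terms the implication looks roughly like this (hypothetical sketch of my reading, not a test from the patch):

define i1 @lshr_implies_nonzero(i32 %n) {
  ; If %n >= 8 (unsigned), some bit at position 3 or above is set,
  ; so %n >> 2 still has a set bit and cannot be zero.
  %ge8 = icmp uge i32 %n, 8
  %shr = lshr i32 %n, 2
  %shr.nz = icmp ne i32 %shr, 0
  %and = and i1 %ge8, %shr.nz ; expected to fold to %ge8
  ret i1 %and
}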
>From 6a68fd9597152d5243133e32051fddeecc08d5af Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 22 Oct 2023 14:19:08 +0800
Subject: [PATCH 4/5] [ValueTracking] Add tests from PR69038. NFC.
---
.../Analysis/ValueTracking/implied-icmp-binop.ll | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
index 882c38f329bd884..b9cc3f9a1b1615c 100644
--- a/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
+++ b/llvm/test/Analysis/ValueTracking/implied-icmp-binop.ll
@@ -191,3 +191,16 @@ entry:
%and = and i1 %cmp, %cmp2
ret i1 %and
}
+
+define i1 @pr69038(i32 %a, i32 %b) {
+; CHECK-LABEL: define i1 @pr69038(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT: ret i1 [[TOBOOL]]
+;
+ %tobool = icmp ne i32 %a, 0
+ %or = or i32 %a, %b
+ %tobool1 = icmp ne i32 %or, 0
+ %and = and i1 %tobool, %tobool1
+ ret i1 %and
+}
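+
To spell out the implication @pr69038 exercises: when %a is already known non-zero, or-ing in %b can only set more bits, so (%a | %b) is non-zero for every %b and the second compare carries no new information. A standalone reproducer (hypothetical function name; runnable with `opt -passes=instcombine -S`):

define i1 @or_implies_nonzero(i32 %a, i32 %b) {
  ; 'or' never clears bits, so %a != 0 guarantees (%a | %b) != 0.
  %a.nz = icmp ne i32 %a, 0
  %or = or i32 %a, %b
  %or.nz = icmp ne i32 %or, 0
  %and = and i1 %a.nz, %or.nz ; expected to fold to %a.nz
  ret i1 %and
}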
>From 374533a08d764db6a37f8f70760f3d31f0acb65b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 4 Nov 2023 18:05:54 +0800
Subject: [PATCH 5/5] fixup! [ValueTracking] Improve `isImpliedCondICmps` to
handle binops
Rebase to resolve conflicts
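
For reviewers skimming the regenerated AMDGPU output below: the common thread in the sdiv64/srem64/udiv64/urem64 diffs is that the `v_cmp_ne_u64_e32 vcc, 0, v[7:8]` guard in front of the %udiv-preheader block is gone, so one level of `s_and_saveexec`/`s_xor` control flow drops out, the exec masks in s[6:7] and s[8:9] trade places, and the block labels renumber (.LBB*_6 becomes .LBB*_4, and so on). My reading of the diff (the patch does not state this explicitly) is that the shift count is already bounded by the surrounding 63-compares, so count+1 can never wrap to zero. A hypothetical IR sketch of that implication:

define i1 @add_one_nonzero(i64 %n) {
  ; If %n < 63 (unsigned), %n + 1 lies in [1, 63] and cannot
  ; wrap to zero, so the second compare is implied by the first.
  %small = icmp ult i64 %n, 63
  %inc = add i64 %n, 1
  %inc.nz = icmp ne i64 %inc, 0
  %and = and i1 %small, %inc.nz ; expected to fold to %small
  ret i1 %and
}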
---
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 76 +++++++++----------
llvm/test/CodeGen/AMDGPU/srem64.ll | 42 +++++-----
llvm/test/CodeGen/AMDGPU/udiv64.ll | 50 +++++-------
llvm/test/CodeGen/AMDGPU/urem64.ll | 38 ++++------
llvm/test/CodeGen/PowerPC/reduce_cr.ll | 4 +-
.../CodeGen/Thumb2/mve-float16regloops.ll | 64 ++++++++--------
.../CodeGen/Thumb2/mve-float32regloops.ll | 63 ++++++++-------
7 files changed, 152 insertions(+), 185 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 950e8c60ef9d01f..cdb1930658e4f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -371,12 +371,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
+; GCN-IR-NEXT: v_min_u32_e32 v12, v2, v3
+; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
+; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11
; GCN-IR-NEXT: v_min_u32_e32 v2, v2, v3
-; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v10
-; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3
-; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v11
-; GCN-IR-NEXT: v_min_u32_e32 v12, v3, v8
-; GCN-IR-NEXT: v_sub_i32_e32 v13, vcc, v2, v12
+; GCN-IR-NEXT: v_sub_i32_e32 v13, vcc, v12, v2
; GCN-IR-NEXT: v_subb_u32_e64 v14, s[8:9], 0, 0, vcc
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[13:14]
; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
@@ -395,13 +395,13 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v13
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 63, v13
; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v2, v2
; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[10:11], v8
; GCN-IR-NEXT: v_lshr_b64 v[13:14], v[10:11], v14
+; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, -1, v1, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v10, v12
; GCN-IR-NEXT: v_not_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v2, v12
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v10, v2
; GCN-IR-NEXT: v_mov_b32_e32 v16, 0
; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
@@ -1539,44 +1539,36 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
-; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5
+; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8
-; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6]
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, s6, v8
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[4:5]
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7]
+; GCN-IR-NEXT: v_mov_b32_e32 v0, 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v7, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB12_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 63, v6
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v4
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
-; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[6:7], s[4:5], v6
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v0
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8
; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
@@ -1601,18 +1593,18 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v13, v1
-; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GCN-IR-NEXT: v_mov_b32_e32 v12, v0
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2
; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[6:7], 1
; GCN-IR-NEXT: v_or_b32_e32 v4, v0, v4
; GCN-IR-NEXT: v_mov_b32_e32 v0, v4
; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
; GCN-IR-NEXT: .LBB12_4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2
; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v3
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 0191930aabba5a1..90ee3a3da39e0cc 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1725,34 +1725,26 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
-; GCN-IR-NEXT: s_movk_i32 s8, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s8, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v2, 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB12_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2
@@ -1780,18 +1772,18 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
-; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_2
; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v3, v4
; GCN-IR-NEXT: .LBB12_4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3
; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 6fc0e90b7724460..eb996692e99ce07 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -315,12 +315,12 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v6, v8
+; GCN-IR-NEXT: v_min_u32_e32 v6, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v8, v6
; GCN-IR-NEXT: v_subb_u32_e64 v10, s[8:9], 0, 0, vcc
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[9:10]
; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
@@ -340,10 +340,10 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[0:1], v10
; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v0, v6
+; GCN-IR-NEXT: v_not_b32_e32 v0, v8
; GCN-IR-NEXT: v_not_b32_e32 v1, 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-IR-NEXT: .LBB1_2: ; %udiv-do-while
@@ -1132,35 +1132,27 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v3
; GCN-IR-NEXT: v_min_u32_e32 v6, v0, v1
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000
+; GCN-IR-NEXT: v_mov_b32_e32 v0, 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB9_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB9_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v0
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
@@ -1185,18 +1177,18 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v11, v1
-; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GCN-IR-NEXT: v_mov_b32_e32 v10, v0
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_2
; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[1:2], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1
; GCN-IR-NEXT: v_mov_b32_e32 v0, v1
; GCN-IR-NEXT: v_mov_b32_e32 v1, v2
; GCN-IR-NEXT: .LBB9_4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 32768, %x
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index e91053384b3cec0..6264203099699ab 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1124,32 +1124,24 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd0, v6
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4]
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5]
+; GCN-IR-NEXT: v_mov_b32_e32 v2, 0x8000
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT: v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-IR-NEXT: s_cbranch_execz .LBB8_6
+; GCN-IR-NEXT: s_cbranch_execz .LBB8_4
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
-; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GCN-IR-NEXT: s_cbranch_execz .LBB8_5
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 63, v4
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v2
@@ -1177,18 +1169,18 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
; GCN-IR-NEXT: v_mov_b32_e32 v11, v3
-; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GCN-IR-NEXT: v_mov_b32_e32 v10, v2
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB8_2
; GCN-IR-NEXT: ; %bb.3: ; %udiv-loop-exit
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v3, v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v3, v4
; GCN-IR-NEXT: .LBB8_4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v3
; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2
diff --git a/llvm/test/CodeGen/PowerPC/reduce_cr.ll b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
index 7491d13c5301015..e5761de12670ff9 100644
--- a/llvm/test/CodeGen/PowerPC/reduce_cr.ll
+++ b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
@@ -7,7 +7,7 @@ target triple = "powerpc64le-grtev4-linux-gnu"
;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = {{.*}}
-;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = {{.*}}
+;CHECK-NEXT: - BB3[optional2]: float = 0.625, int = {{.*}}
;CHECK: block-frequency-info: loop_test
;CHECK: block-frequency-info: loop_test
@@ -19,7 +19,7 @@ target triple = "powerpc64le-grtev4-linux-gnu"
;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = {{.*}}
;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = {{.*}}
-;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = {{.*}}
+;CHECK-NEXT: - BB4[optional2]: float = 0.625, int = {{.*}}
define void @loop_test(ptr %tags, i32 %count) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 70a619c37bf2517..84ef6b1a02750f7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1019,29 +1019,29 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: b .LBB16_6
-; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_5
+; CHECK-NEXT: .LBB16_2: @ %while.end.loopexit
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
-; CHECK-NEXT: b .LBB16_5
-; CHECK-NEXT: .LBB16_4: @ %for.end
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_4
+; CHECK-NEXT: .LBB16_3: @ %for.end
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: wls lr, r0, .LBB16_5
-; CHECK-NEXT: b .LBB16_10
-; CHECK-NEXT: .LBB16_5: @ %while.end
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: wls lr, r0, .LBB16_4
+; CHECK-NEXT: b .LBB16_9
+; CHECK-NEXT: .LBB16_4: @ %while.end
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r9, r9, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
-; CHECK-NEXT: beq.w .LBB16_12
-; CHECK-NEXT: .LBB16_6: @ %while.body
+; CHECK-NEXT: beq.w .LBB16_11
+; CHECK-NEXT: .LBB16_5: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
-; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
-; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
+; CHECK-NEXT: @ Child Loop BB16_7 Depth 2
+; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r12, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
@@ -1077,14 +1077,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
-; CHECK-NEXT: blo .LBB16_9
-; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: blo .LBB16_8
+; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: .LBB16_8: @ %for.body
-; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT: .LBB16_7: @ %for.body
+; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
@@ -1114,24 +1114,24 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r4
-; CHECK-NEXT: le lr, .LBB16_8
-; CHECK-NEXT: b .LBB16_4
-; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: vfma.f16 q0, q1, r3
+; CHECK-NEXT: le lr, .LBB16_7
+; CHECK-NEXT: b .LBB16_3
+; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: b .LBB16_4
-; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_3
+; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: .LBB16_11: @ %while.body76
-; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT: .LBB16_10: @ %while.body76
+; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r3, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
-; CHECK-NEXT: vfma.f16 q0, q1, r4
-; CHECK-NEXT: le lr, .LBB16_11
-; CHECK-NEXT: b .LBB16_3
-; CHECK-NEXT: .LBB16_12: @ %if.end
+; CHECK-NEXT: vfma.f16 q0, q1, r3
+; CHECK-NEXT: le lr, .LBB16_10
+; CHECK-NEXT: b .LBB16_2
+; CHECK-NEXT: .LBB16_11: @ %if.end
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 332453360a752c2..394a8ba8f53d09f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1014,31 +1014,30 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: b .LBB16_6
-; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_5
+; CHECK-NEXT: .LBB16_2: @ %while.end.loopexit
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r4, r4, r0, lsl #2
-; CHECK-NEXT: b .LBB16_5
-; CHECK-NEXT: .LBB16_4: @ %for.end
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_4
+; CHECK-NEXT: .LBB16_3: @ %for.end
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
-; CHECK-NEXT: wls lr, r0, .LBB16_5
-; CHECK-NEXT: b .LBB16_10
-; CHECK-NEXT: .LBB16_5: @ %while.end
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: wls lr, r0, .LBB16_4
+; CHECK-NEXT: b .LBB16_9
+; CHECK-NEXT: .LBB16_4: @ %while.end
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: subs.w r10, r10, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
; CHECK-NEXT: add.w r4, r0, #16
-; CHECK-NEXT: beq .LBB16_12
-; CHECK-NEXT: .LBB16_6: @ %while.body
+; CHECK-NEXT: beq .LBB16_11
+; CHECK-NEXT: .LBB16_5: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
-; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
-; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT: add.w lr, r10, #8
+; CHECK-NEXT: @ Child Loop BB16_7 Depth 2
+; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: ldrd r7, r6, [r12]
; CHECK-NEXT: ldrd r0, r5, [r12, #8]
@@ -1065,14 +1064,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
-; CHECK-NEXT: blo .LBB16_9
-; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: blo .LBB16_8
+; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: .LBB16_8: @ %for.body
-; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT: .LBB16_7: @ %for.body
+; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
; CHECK-NEXT: vldrw.u32 q1, [r4], #32
@@ -1093,23 +1092,23 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
-; CHECK-NEXT: le lr, .LBB16_8
-; CHECK-NEXT: b .LBB16_4
-; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT: le lr, .LBB16_7
+; CHECK-NEXT: b .LBB16_3
+; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: b .LBB16_4
-; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
-; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: .LBB16_11: @ %while.body76
-; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT: b .LBB16_3
+; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT: mov r6, r4
+; CHECK-NEXT: .LBB16_10: @ %while.body76
+; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r6], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: le lr, .LBB16_11
-; CHECK-NEXT: b .LBB16_3
-; CHECK-NEXT: .LBB16_12:
+; CHECK-NEXT: le lr, .LBB16_10
+; CHECK-NEXT: b .LBB16_2
+; CHECK-NEXT: .LBB16_11:
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4