[llvm] [GISel] Combine out-of-range shifts with value to 0 or -1 (PR #123510)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 02:35:18 PST 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/123510
>From 63c7775f88cb90796ad2b69c44ce756591cb3545 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 12:42:52 +0800
Subject: [PATCH 1/8] First commit
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 28 +++++------
.../include/llvm/Target/GlobalISel/Combine.td | 13 +++--
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 49 ++++++++++++++++++-
3 files changed, 70 insertions(+), 20 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 94e36e412b0cf7..da558de5946559 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -115,18 +115,13 @@ class CombinerHelper {
public:
CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
- bool IsPreLegalize,
- GISelKnownBits *KB = nullptr,
+ bool IsPreLegalize, GISelKnownBits *KB = nullptr,
MachineDominatorTree *MDT = nullptr,
const LegalizerInfo *LI = nullptr);
- GISelKnownBits *getKnownBits() const {
- return KB;
- }
+ GISelKnownBits *getKnownBits() const { return KB; }
- MachineIRBuilder &getBuilder() const {
- return Builder;
- }
+ MachineIRBuilder &getBuilder() const { return Builder; }
const TargetLowering &getTargetLowering() const;
@@ -150,8 +145,10 @@ class CombinerHelper {
/// is a legal integer constant type on the target.
bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
- /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
- void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const;
+ /// MachineRegisterInfo::replaceRegWith() and inform the observer of the
+ /// changes
+ void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
+ Register ToReg) const;
/// Replace a single register operand with a new register and inform the
/// observer of the changes.
@@ -482,12 +479,12 @@ class CombinerHelper {
bool matchEqualDefs(const MachineOperand &MOP1,
const MachineOperand &MOP2) const;
- /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value equal to
- /// \p C.
+ /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value
+ /// equal to \p C.
bool matchConstantOp(const MachineOperand &MOP, int64_t C) const;
- /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value exactly
- /// equal to \p C.
+ /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value
+ /// exactly equal to \p C.
bool matchConstantFPOp(const MachineOperand &MOP, double C) const;
/// @brief Checks if constant at \p ConstIdx is larger than \p MI 's bitwidth
@@ -841,7 +838,8 @@ class CombinerHelper {
BuildFnTy &MatchInfo) const;
/// Match shifts greater or equal to the bitwidth of the operation.
- bool matchShiftsTooBig(MachineInstr &MI) const;
+ bool matchShiftsTooBig(MachineInstr &MI,
+ std::optional<int64_t> &MatchInfo) const;
/// Match constant LHS ops that should be commuted.
bool matchCommuteConstantToRHS(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8641eabbdd84c6..ae0856550d9356 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,11 +306,18 @@ def ptr_add_immed_chain : GICombineRule<
[{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
+def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
def shifts_too_big : GICombineRule<
- (defs root:$root),
+ (defs root:$root, shift_result_matchdata:$matchinfo),
(match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
- [{ return Helper.matchShiftsTooBig(*${root}); }]),
- (apply [{ Helper.replaceInstWithUndef(*${root}); }])>;
+ [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+ (apply [{
+ if (${matchinfo}) {
+ Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+ } else {
+ Helper.replaceInstWithUndef(*${root});
+ }
+ }])>;
// Fold shift (shift base x), y -> shift base, (x+y), if shifts are same
def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..6c04337ad73e0b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -35,6 +35,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/DivisionByConstantInfo.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
@@ -6590,12 +6591,56 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
return CmpInst::isEquality(Pred) && Y.isValid();
}
-bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) const {
+static std::optional<unsigned>
+getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
+ std::optional<int64_t> &Result) {
+ assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
+ Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR.");
+ auto SignificantBits = 0;
+ switch (Opcode) {
+ case TargetOpcode::G_SHL:
+ SignificantBits = ValueKB.countMinTrailingZeros();
+ Result = 0;
+ break;
+ case TargetOpcode::G_LSHR:
+ Result = 0;
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ break;
+ case TargetOpcode::G_ASHR:
+ if (ValueKB.isNonNegative()) {
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ Result = 0;
+ } else if (ValueKB.isNegative()) {
+ SignificantBits = ValueKB.countMinLeadingOnes();
+ Result = -1;
+ } else {
+ // Cannot determine shift result.
+ Result = std::nullopt;
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+ return ValueKB.getBitWidth() - SignificantBits;
+}
+
+bool CombinerHelper::matchShiftsTooBig(
+ MachineInstr &MI, std::optional<int64_t> &MatchInfo) const {
+ Register ShiftVal = MI.getOperand(1).getReg();
Register ShiftReg = MI.getOperand(2).getReg();
LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
auto IsShiftTooBig = [&](const Constant *C) {
auto *CI = dyn_cast<ConstantInt>(C);
- return CI && CI->uge(ResTy.getScalarSizeInBits());
+ if (!CI)
+ return false;
+ if (CI->uge(ResTy.getScalarSizeInBits())) {
+ MatchInfo = std::nullopt;
+ return true;
+ }
+ auto OptMaxUsefulShift = getMaxUsefulShift(KB->getKnownBits(ShiftVal),
+ MI.getOpcode(), MatchInfo);
+ return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift);
};
return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
}
>From 56dfed9473140716806cfb41ed7371d5cb1e0030 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 20:29:57 +0800
Subject: [PATCH 2/8] Fix up
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 28 +++++++++++--------
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 -
2 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index da558de5946559..a51aa876e1deb0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -115,13 +115,18 @@ class CombinerHelper {
public:
CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
- bool IsPreLegalize, GISelKnownBits *KB = nullptr,
+ bool IsPreLegalize,
+ GISelKnownBits *KB = nullptr,
MachineDominatorTree *MDT = nullptr,
const LegalizerInfo *LI = nullptr);
- GISelKnownBits *getKnownBits() const { return KB; }
+ GISelKnownBits *getKnownBits() const {
+ return KB;
+ }
- MachineIRBuilder &getBuilder() const { return Builder; }
+ MachineIRBuilder &getBuilder() const {
+ return Builder;
+ }
const TargetLowering &getTargetLowering() const;
@@ -145,10 +150,8 @@ class CombinerHelper {
/// is a legal integer constant type on the target.
bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
- /// MachineRegisterInfo::replaceRegWith() and inform the observer of the
- /// changes
- void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
- Register ToReg) const;
+ /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
+ void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const;
/// Replace a single register operand with a new register and inform the
/// observer of the changes.
@@ -479,12 +482,12 @@ class CombinerHelper {
bool matchEqualDefs(const MachineOperand &MOP1,
const MachineOperand &MOP2) const;
- /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value
- /// equal to \p C.
+ /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value equal to
+ /// \p C.
bool matchConstantOp(const MachineOperand &MOP, int64_t C) const;
- /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value
- /// exactly equal to \p C.
+ /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value exactly
+ /// equal to \p C.
bool matchConstantFPOp(const MachineOperand &MOP, double C) const;
/// @brief Checks if constant at \p ConstIdx is larger than \p MI 's bitwidth
@@ -837,7 +840,8 @@ class CombinerHelper {
bool matchRedundantBinOpInEquality(MachineInstr &MI,
BuildFnTy &MatchInfo) const;
- /// Match shifts greater or equal to the bitwidth of the operation.
+ /// Match shifts greater or equal to the range (bitwidth of the operation, or
+ /// the source value).
bool matchShiftsTooBig(MachineInstr &MI,
std::optional<int64_t> &MatchInfo) const;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 6c04337ad73e0b..b4cf1655b04728 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -35,7 +35,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/DivisionByConstantInfo.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
>From ce31b1f5be7b6f36033e539e03b4d2d44972151d Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:14:39 +0800
Subject: [PATCH 3/8] adding AMDGPU tests
---
.../AMDGPU/GlobalISel/combine-shifts.mir | 114 ++++++++++++++++++
1 file changed, 114 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
new file mode 100644
index 00000000000000..157ab5d7bb03dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: combine_ashr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_ashr
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_ASHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_lshr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_lshr
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_LSHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_shl
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_shl
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 16
+ %11:_(s32) = G_CONSTANT i32 4294901760
+ %13:_(s32) = G_SHL %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_ashr2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_ashr2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 16
+ %11:_(s8) = G_CONSTANT i8 -126
+ G_STORE %13(s8), %0(p0) :: (store (s8))
+ SI_RETURN
+
+...
>From d0043990b2e248559a3ee38a735225ae532152d5 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:20:23 +0800
Subject: [PATCH 4/8] Adding new tests
---
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
index 157ab5d7bb03dd..d3a3827f35a18a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -100,14 +100,15 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+ ; CHECK-NEXT: G_STORE [[C]](s8), [[MV]](p0) :: (store (s8))
; CHECK-NEXT: SI_RETURN
%9:_(s32) = COPY $vgpr0
%10:_(s32) = COPY $vgpr1
%0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
- %12:_(s32) = G_CONSTANT i32 16
- %11:_(s8) = G_CONSTANT i8 -126
+ %12:_(s32) = G_CONSTANT i32 1
+ %11:_(s8) = G_CONSTANT i8 -2
+ %13:_(s8) = G_ASHR %11, %12(s32)
G_STORE %13(s8), %0(p0) :: (store (s8))
SI_RETURN
>From ec1618f69f72ee0edbbec93664ee141ff55dbd7d Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:50:46 +0800
Subject: [PATCH 5/8] updates
---
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b4cf1655b04728..4821c21301946b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6615,7 +6615,6 @@ getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
} else {
// Cannot determine shift result.
Result = std::nullopt;
- return false;
}
break;
default:
>From 12fb70efe614b35a86c9b7880a42770be0b71a3f Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 20 Jan 2025 01:19:05 +0800
Subject: [PATCH 6/8] Fix AMDGPU global isel test failures
---
...mbine-shl-from-extend-narrow.postlegal.mir | 16 +-
...ombine-shl-from-extend-narrow.prelegal.mir | 16 +-
.../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 16 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 12 +-
.../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 69 +--
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 439 ++++++++----------
.../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 114 ++---
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 387 +++++++--------
.../GlobalISel/widen-i8-i16-scalar-loads.ll | 18 +-
9 files changed, 481 insertions(+), 606 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6ae8895322d6f9..a8cd974b01ab4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,23 +374,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..3780542cd87993 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,23 +246,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0, $vgpr1
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index b9cd330ee2b5f9..4ddbb0afd7fc58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1434,13 +1434,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_ffbh_i32_e32 v2, 0
+; SI-NEXT: v_add_i32_e32 v2, vcc, -1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; SI-NEXT: v_ffbh_i32_e32 v3, 0
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
-; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: v_min_u32_e32 v2, v3, v2
+; SI-NEXT: v_min_u32_e32 v2, 32, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT: v_min_u32_e32 v0, 1, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -1452,13 +1450,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_i32_e32 v2, 0
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v2
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT: v_ffbh_i32_e32 v3, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_min_u32_e32 v2, v3, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index cc185aff9eff22..784611cf68dd23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1800,9 +1800,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GCN-NEXT: s_lshr_b32 s0, s1, 1
; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
@@ -1810,9 +1810,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 88eb0e4b848c95..2fa5492c8a2b72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -608,34 +608,25 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v2, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_i32_24bit:
@@ -677,20 +668,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
@@ -711,15 +688,15 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
@@ -729,10 +706,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v8
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 14b30e0d79946c..80cda2e7f3c816 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2530,254 +2530,227 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v9, v7
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v7, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7
-; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v5, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16
-; GISEL-NEXT: v_mov_b32_e32 v1, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
-; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8]
+; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8]
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc
+; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, v6
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1
+; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
-; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1
+; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
-; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7
+; GISEL-NEXT: v_mov_b32_e32 v1, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 2b12e4b973acb2..530f4cf53321ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -647,31 +647,23 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v0, v3
-; GISEL-NEXT: v_mul_lo_u32 v3, v3, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v2, v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_i32_24bit:
@@ -711,56 +703,40 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i32_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index ee7a040e41fd5e..1f4448d9a632a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3028,253 +3028,226 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v9, v7
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT: v_mov_b32_e32 v3, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT: v_mov_b32_e32 v3, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1
-; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc
; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v6, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2
-; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT: v_trunc_f32_e32 v9, v4
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5
-; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4
-; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v4
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
+; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0
+; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v17, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v17, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v9, v8, v5
-; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v3, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2]
-; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 9cd85553eb7b61..6730df000e3b8c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -380,20 +380,20 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
; GFX8-LABEL: constant_zextload_i8_align2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_add_u32 s2, s0, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: flat_store_short v[2:3], v5
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_zextload_i8_align2:
@@ -404,7 +404,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v0, s[0:1] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
@@ -415,7 +415,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v0, s[0:1] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
>From 0ab36e76d95cf14024a2c083d190d401215bd820 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 20 Jan 2025 16:27:54 +0800
Subject: [PATCH 7/8] Do not use `wip_match_opcode`
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 +++-
llvm/include/llvm/Target/GlobalISel/Combine.td | 13 +++++++++----
2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index a51aa876e1deb0..161ca5da505962 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -841,7 +841,9 @@ class CombinerHelper {
BuildFnTy &MatchInfo) const;
/// Match shifts greater or equal to the range (bitwidth of the operation, or
- /// the source value).
+ /// the source value). When matched, also return the minimum useless shift
+ /// amount that results in complete loss of the source value. If the optional
+ /// value is std::nullopt, then the shift result is undefined.
bool matchShiftsTooBig(MachineInstr &MI,
std::optional<int64_t> &MatchInfo) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ae0856550d9356..7230bfaba7f535 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,16 +306,21 @@ def ptr_add_immed_chain : GICombineRule<
[{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
+def shift_const_op : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op,
+ [G_SHL, G_ASHR, G_LSHR],
+ (pattern (G_CONSTANT $amt, $imm), (op $dst, $shifted, $amt)))>;
def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
def shifts_too_big : GICombineRule<
(defs root:$root, shift_result_matchdata:$matchinfo),
- (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
- [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+ (match (shift_const_op $root):$mi,
+ [{ return Helper.matchShiftsTooBig(*${mi}, ${matchinfo}); }]),
(apply [{
if (${matchinfo}) {
- Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+ Helper.replaceInstWithConstant(*${mi}, *${matchinfo});
} else {
- Helper.replaceInstWithUndef(*${root});
+ Helper.replaceInstWithUndef(*${mi});
}
}])>;
>From 92b3db3c48d6f4675073443c3abe335354785f4e Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 20 Jan 2025 18:34:40 +0800
Subject: [PATCH 8/8] update according to comments
---
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 ++----
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 10 ++++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 161ca5da505962..9b78342c8fc393 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -840,10 +840,8 @@ class CombinerHelper {
bool matchRedundantBinOpInEquality(MachineInstr &MI,
BuildFnTy &MatchInfo) const;
- /// Match shifts greater or equal to the range (bitwidth of the operation, or
- /// the source value). When match, also return the minimum useless shift
- /// amount that results in complete loss of the source value. if the optional
- /// value is std::nullopt, then the shift result is undefined.
+ /// Match shifts greater or equal to the range (the bitwidth of the result
+ /// datatype, or the effective bitwidth of the source value).
bool matchShiftsTooBig(MachineInstr &MI,
std::optional<int64_t> &MatchInfo) const;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4821c21301946b..b193d8bb0aa18a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6590,9 +6590,11 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
return CmpInst::isEquality(Pred) && Y.isValid();
}
+/// Return the minimum useless shift amount that results in complete loss of
+/// the source value. Return std::nullopt when such an amount cannot be
+/// determined.
static std::optional<unsigned>
-getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
- std::optional<int64_t> &Result) {
+getMinUselessShift(KnownBits ValueKB, unsigned Opcode,
+ std::optional<int64_t> &Result) {
assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR.");
auto SignificantBits = 0;
@@ -6636,8 +6638,8 @@ bool CombinerHelper::matchShiftsTooBig(
MatchInfo = std::nullopt;
return true;
}
- auto OptMaxUsefulShift = getMaxUsefulShift(KB->getKnownBits(ShiftVal),
- MI.getOpcode(), MatchInfo);
+ auto OptMaxUsefulShift = getMinUselessShift(KB->getKnownBits(ShiftVal),
+ MI.getOpcode(), MatchInfo);
return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift);
};
return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
More information about the llvm-commits
mailing list