[llvm] [GISel] Combine out-of-range shifts with value to 0 or -1 (PR #123510)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 20 02:27:57 PST 2025


https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/123510

>From 63c7775f88cb90796ad2b69c44ce756591cb3545 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 12:42:52 +0800
Subject: [PATCH 1/7] First commit

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  | 28 +++++------
 .../include/llvm/Target/GlobalISel/Combine.td | 13 +++--
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 49 ++++++++++++++++++-
 3 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 94e36e412b0cf7..da558de5946559 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -115,18 +115,13 @@ class CombinerHelper {
 
 public:
   CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
-                 bool IsPreLegalize,
-                 GISelKnownBits *KB = nullptr,
+                 bool IsPreLegalize, GISelKnownBits *KB = nullptr,
                  MachineDominatorTree *MDT = nullptr,
                  const LegalizerInfo *LI = nullptr);
 
-  GISelKnownBits *getKnownBits() const {
-    return KB;
-  }
+  GISelKnownBits *getKnownBits() const { return KB; }
 
-  MachineIRBuilder &getBuilder() const {
-    return Builder;
-  }
+  MachineIRBuilder &getBuilder() const { return Builder; }
 
   const TargetLowering &getTargetLowering() const;
 
@@ -150,8 +145,10 @@ class CombinerHelper {
   /// is a legal integer constant type on the target.
   bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
 
-  /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
-  void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const;
+  /// MachineRegisterInfo::replaceRegWith() and inform the observer of the
+  /// changes
+  void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
+                      Register ToReg) const;
 
   /// Replace a single register operand with a new register and inform the
   /// observer of the changes.
@@ -482,12 +479,12 @@ class CombinerHelper {
   bool matchEqualDefs(const MachineOperand &MOP1,
                       const MachineOperand &MOP2) const;
 
-  /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value equal to
-  /// \p C.
+  /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value
+  /// equal to \p C.
   bool matchConstantOp(const MachineOperand &MOP, int64_t C) const;
 
-  /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value exactly
-  /// equal to \p C.
+  /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value
+  /// exactly equal to \p C.
   bool matchConstantFPOp(const MachineOperand &MOP, double C) const;
 
   /// @brief Checks if constant at \p ConstIdx is larger than \p MI 's bitwidth
@@ -841,7 +838,8 @@ class CombinerHelper {
                                      BuildFnTy &MatchInfo) const;
 
   /// Match shifts greater or equal to the bitwidth of the operation.
-  bool matchShiftsTooBig(MachineInstr &MI) const;
+  bool matchShiftsTooBig(MachineInstr &MI,
+                         std::optional<int64_t> &MatchInfo) const;
 
   /// Match constant LHS ops that should be commuted.
   bool matchCommuteConstantToRHS(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8641eabbdd84c6..ae0856550d9356 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,11 +306,18 @@ def ptr_add_immed_chain : GICombineRule<
          [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
 
+def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
 def shifts_too_big : GICombineRule<
-  (defs root:$root),
+  (defs root:$root, shift_result_matchdata:$matchinfo),
   (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
-         [{ return Helper.matchShiftsTooBig(*${root}); }]),
-  (apply [{ Helper.replaceInstWithUndef(*${root}); }])>;
+         [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+  (apply [{
+    if (${matchinfo}) {
+      Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+    } else {
+      Helper.replaceInstWithUndef(*${root});
+    }
+  }])>;
 
 // Fold shift (shift base x), y -> shift base, (x+y), if shifts are same
 def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..6c04337ad73e0b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DivisionByConstantInfo.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cmath>
@@ -6590,12 +6591,56 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
   return CmpInst::isEquality(Pred) && Y.isValid();
 }
 
-bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) const {
+static std::optional<unsigned>
+getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
+                  std::optional<int64_t> &Result) {
+  assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
+         Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR.");
+  auto SignificantBits = 0;
+  switch (Opcode) {
+  case TargetOpcode::G_SHL:
+    SignificantBits = ValueKB.countMinTrailingZeros();
+    Result = 0;
+    break;
+  case TargetOpcode::G_LSHR:
+    Result = 0;
+    SignificantBits = ValueKB.countMinLeadingZeros();
+    break;
+  case TargetOpcode::G_ASHR:
+    if (ValueKB.isNonNegative()) {
+      SignificantBits = ValueKB.countMinLeadingZeros();
+      Result = 0;
+    } else if (ValueKB.isNegative()) {
+      SignificantBits = ValueKB.countMinLeadingOnes();
+      Result = -1;
+    } else {
+      // Cannot determine shift result.
+      Result = std::nullopt;
+      return false;
+    }
+    break;
+  default:
+    break;
+  }
+  return ValueKB.getBitWidth() - SignificantBits;
+}
+
+bool CombinerHelper::matchShiftsTooBig(
+    MachineInstr &MI, std::optional<int64_t> &MatchInfo) const {
+  Register ShiftVal = MI.getOperand(1).getReg();
   Register ShiftReg = MI.getOperand(2).getReg();
   LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
   auto IsShiftTooBig = [&](const Constant *C) {
     auto *CI = dyn_cast<ConstantInt>(C);
-    return CI && CI->uge(ResTy.getScalarSizeInBits());
+    if (!CI)
+      return false;
+    if (CI->uge(ResTy.getScalarSizeInBits())) {
+      MatchInfo = std::nullopt;
+      return true;
+    }
+    auto OptMaxUsefulShift = getMaxUsefulShift(KB->getKnownBits(ShiftVal),
+                                               MI.getOpcode(), MatchInfo);
+    return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift);
   };
   return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
 }

>From 56dfed9473140716806cfb41ed7371d5cb1e0030 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 20:29:57 +0800
Subject: [PATCH 2/7] Fix up

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  | 28 +++++++++++--------
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  1 -
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index da558de5946559..a51aa876e1deb0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -115,13 +115,18 @@ class CombinerHelper {
 
 public:
   CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
-                 bool IsPreLegalize, GISelKnownBits *KB = nullptr,
+                 bool IsPreLegalize,
+                 GISelKnownBits *KB = nullptr,
                  MachineDominatorTree *MDT = nullptr,
                  const LegalizerInfo *LI = nullptr);
 
-  GISelKnownBits *getKnownBits() const { return KB; }
+  GISelKnownBits *getKnownBits() const {
+    return KB;
+  }
 
-  MachineIRBuilder &getBuilder() const { return Builder; }
+  MachineIRBuilder &getBuilder() const {
+    return Builder;
+  }
 
   const TargetLowering &getTargetLowering() const;
 
@@ -145,10 +150,8 @@ class CombinerHelper {
   /// is a legal integer constant type on the target.
   bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
 
-  /// MachineRegisterInfo::replaceRegWith() and inform the observer of the
-  /// changes
-  void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
-                      Register ToReg) const;
+  /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
+  void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const;
 
   /// Replace a single register operand with a new register and inform the
   /// observer of the changes.
@@ -479,12 +482,12 @@ class CombinerHelper {
   bool matchEqualDefs(const MachineOperand &MOP1,
                       const MachineOperand &MOP2) const;
 
-  /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value
-  /// equal to \p C.
+  /// Return true if \p MOP is defined by a G_CONSTANT or splat with a value equal to
+  /// \p C.
   bool matchConstantOp(const MachineOperand &MOP, int64_t C) const;
 
-  /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value
-  /// exactly equal to \p C.
+  /// Return true if \p MOP is defined by a G_FCONSTANT or splat with a value exactly
+  /// equal to \p C.
   bool matchConstantFPOp(const MachineOperand &MOP, double C) const;
 
   /// @brief Checks if constant at \p ConstIdx is larger than \p MI 's bitwidth
@@ -837,7 +840,8 @@ class CombinerHelper {
   bool matchRedundantBinOpInEquality(MachineInstr &MI,
                                      BuildFnTy &MatchInfo) const;
 
-  /// Match shifts greater or equal to the bitwidth of the operation.
+  /// Match shifts greater or equal to the range (bitwidth of the operation, or
+  /// the source value).
   bool matchShiftsTooBig(MachineInstr &MI,
                          std::optional<int64_t> &MatchInfo) const;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 6c04337ad73e0b..b4cf1655b04728 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DivisionByConstantInfo.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cmath>

>From ce31b1f5be7b6f36033e539e03b4d2d44972151d Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:14:39 +0800
Subject: [PATCH 3/7] adding AMDGPU tests

---
 .../AMDGPU/GlobalISel/combine-shifts.mir      | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
new file mode 100644
index 00000000000000..157ab5d7bb03dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:            combine_ashr
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: combine_ashr
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+    ; CHECK-NEXT: SI_RETURN
+    %9:_(s32) = COPY $vgpr0
+    %10:_(s32) = COPY $vgpr1
+    %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+    %12:_(s32) = G_CONSTANT i32 10
+    %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %13:_(s32) = G_ASHR %11, %12(s32)
+    G_STORE %13(s32), %0(p0) :: (store (s32))
+    SI_RETURN
+
+...
+---
+name:            combine_lshr
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: combine_lshr
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+    ; CHECK-NEXT: SI_RETURN
+    %9:_(s32) = COPY $vgpr0
+    %10:_(s32) = COPY $vgpr1
+    %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+    %12:_(s32) = G_CONSTANT i32 10
+    %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %13:_(s32) = G_LSHR %11, %12(s32)
+    G_STORE %13(s32), %0(p0) :: (store (s32))
+    SI_RETURN
+
+...
+---
+name:            combine_shl
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: combine_shl
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+    ; CHECK-NEXT: SI_RETURN
+    %9:_(s32) = COPY $vgpr0
+    %10:_(s32) = COPY $vgpr1
+    %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+    %12:_(s32) = G_CONSTANT i32 16
+    %11:_(s32) = G_CONSTANT i32 4294901760
+    %13:_(s32) = G_SHL %11, %12(s32)
+    G_STORE %13(s32), %0(p0) :: (store (s32))
+    SI_RETURN
+
+...
+---
+name:            combine_ashr2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: combine_ashr2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+    ; CHECK-NEXT: SI_RETURN
+    %9:_(s32) = COPY $vgpr0
+    %10:_(s32) = COPY $vgpr1
+    %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+    %12:_(s32) = G_CONSTANT i32 16
+    %11:_(s8) = G_CONSTANT i8 -126
+    G_STORE %13(s8), %0(p0) :: (store (s8))
+    SI_RETURN
+
+...

>From d0043990b2e248559a3ee38a735225ae532152d5 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:20:23 +0800
Subject: [PATCH 4/7] Adding new tests

---
 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
index 157ab5d7bb03dd..d3a3827f35a18a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -100,14 +100,15 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+    ; CHECK-NEXT: G_STORE [[C]](s8), [[MV]](p0) :: (store (s8))
     ; CHECK-NEXT: SI_RETURN
     %9:_(s32) = COPY $vgpr0
     %10:_(s32) = COPY $vgpr1
     %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
-    %12:_(s32) = G_CONSTANT i32 16
-    %11:_(s8) = G_CONSTANT i8 -126
+    %12:_(s32) = G_CONSTANT i32 1
+    %11:_(s8) = G_CONSTANT i8 -2
+    %13:_(s8) = G_ASHR %11, %12(s32)
     G_STORE %13(s8), %0(p0) :: (store (s8))
     SI_RETURN
 

>From ec1618f69f72ee0edbbec93664ee141ff55dbd7d Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 19 Jan 2025 22:50:46 +0800
Subject: [PATCH 5/7] updates

---
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b4cf1655b04728..4821c21301946b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6615,7 +6615,6 @@ getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
     } else {
       // Cannot determine shift result.
       Result = std::nullopt;
-      return false;
     }
     break;
   default:

>From 12fb70efe614b35a86c9b7880a42770be0b71a3f Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 20 Jan 2025 01:19:05 +0800
Subject: [PATCH 6/7] Fix AMDGPU global isel test failures

---
 ...mbine-shl-from-extend-narrow.postlegal.mir |  16 +-
 ...ombine-shl-from-extend-narrow.prelegal.mir |  16 +-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        |  16 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  12 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll     |  69 +--
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     | 439 ++++++++----------
 .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll     | 114 ++---
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     | 387 +++++++--------
 .../GlobalISel/widen-i8-i16-scalar-loads.ll   |  18 +-
 9 files changed, 481 insertions(+), 606 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6ae8895322d6f9..a8cd974b01ab4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,23 +374,15 @@ body:             |
     ; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
-    ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
-    ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+    ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+    ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     ;
     ; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
-    ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
-    ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+    ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     %zero:_(s16) = G_CONSTANT i16 0
     %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..3780542cd87993 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,23 +246,15 @@ body:             |
     ; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
-    ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
-    ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+    ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+    ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     ;
     ; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
-    ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
-    ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+    ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     %zero:_(s16) = G_CONSTANT i16 0
     %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index b9cd330ee2b5f9..4ddbb0afd7fc58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1434,13 +1434,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_ffbh_i32_e32 v2, 0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; SI-NEXT:    v_ffbh_i32_e32 v3, 0
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    v_min_u32_e32 v2, v3, v2
+; SI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
 ; SI-NEXT:    v_min_u32_e32 v0, 1, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1452,13 +1450,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_i32_e32 v2, 0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT:    v_ffbh_i32_e32 v3, 0
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_min_u32_e32 v2, v3, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    v_min_u32_e32 v0, 1, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index cc185aff9eff22..784611cf68dd23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1800,9 +1800,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
 ; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], 1
 ; GCN-NEXT:    s_lshr_b32 s0, s1, 1
 ; GCN-NEXT:    s_mov_b32 s1, 0
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT:    s_lshr_b32 s2, s3, 1
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], 31
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i65_33:
@@ -1810,9 +1810,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
 ; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[2:3], 1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s1, 1
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
-; GFX10PLUS-NEXT:    s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT:    s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT:    s_mov_b32 s2, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, 33
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 88eb0e4b848c95..2fa5492c8a2b72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -608,34 +608,25 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v1
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_i32_24bit:
@@ -677,20 +668,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT:    v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -711,15 +688,15 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
 ; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
@@ -729,10 +706,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 14b30e0d79946c..80cda2e7f3c816 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2530,254 +2530,227 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT:    v_mov_b32_e32 v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT:    v_mov_b32_e32 v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v1
+; GISEL-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v4
+; GISEL-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v3
+; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v3
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
+; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v3
+; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v3
+; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT:    v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_mov_b32_e32 v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v11, v7
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v2
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v9, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v6, v1
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v4
-; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, 0, v2
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
-; GISEL-NEXT:    v_subb_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
-; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v18, v1, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, 0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, 0, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v1
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v13, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v16
-; GISEL-NEXT:    v_mov_b32_e32 v1, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
-; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v17, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v16, v13, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], 0, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v3
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8]
+; GISEL-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v6
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 0, v7
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v2
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v3
+; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, -1, v16, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v15, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v6
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v8, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v2, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v15, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v12, v2
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v15, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v0
+; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2]
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v12
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v1
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v5
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
+; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v1
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v12, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v1, v10, v1
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
-; GISEL-NEXT:    v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v0, v9, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
+; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v6, 0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v15, v8, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, 0, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v1, 0, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v1, v7
+; GISEL-NEXT:    v_mov_b32_e32 v1, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v5
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], 0, v6
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v6
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 2b12e4b973acb2..530f4cf53321ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -647,31 +647,23 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v1
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_i32_24bit:
@@ -711,56 +703,40 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v3
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v1, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v1, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index ee7a040e41fd5e..1f4448d9a632a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3028,253 +3028,226 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v3
+; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
+; GISEL-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT:    v_mov_b32_e32 v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT:    v_mov_b32_e32 v3, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT:    v_mul_lo_u32 v3, v5, v7
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
+; GISEL-NEXT:    v_mul_lo_u32 v14, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v13, v3
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v3
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT:    v_mov_b32_e32 v5, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT:    v_mul_lo_u32 v3, v5, v7
+; GISEL-NEXT:    v_and_b32_e32 v11, 0xffffff, v0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
+; GISEL-NEXT:    v_mul_lo_u32 v3, v5, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, 0, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, 0, v0
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v6, v9, s[4:5]
-; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v7, v1
-; GISEL-NEXT:    v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v11, v5
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, v6, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v4
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT:    v_sub_i32_e64 v15, s[4:5], 0, v2
-; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v4
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v0
+; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], 0, v3
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT:    v_sub_i32_e64 v13, s[4:5], 0, v6
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, v18, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v4
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v5
-; GISEL-NEXT:    v_mul_hi_u32 v19, v14, v4
-; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v10, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v19
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, -1, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v0, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v19, v17, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT:    v_mul_hi_u32 v18, v14, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v19, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; GISEL-NEXT:    v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v9, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v17, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
-; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, v7, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v18, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v17, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v4
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 0, v12
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
+; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v15, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v16, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v15, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v5
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v9, v4
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v5
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v4, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v5, v4
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v10, v0
+; GISEL-NEXT:    v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v5, 0, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v13, v6, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v6, v2, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_mul_lo_u32 v9, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v0, 0, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v17, v0
-; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v0
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_mul_hi_u32 v0, v17, v0
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v17, v0, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v5
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v4, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v3, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2]
-; GISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 0, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v2
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0, v3
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v11, 0, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v10, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, -1, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 9cd85553eb7b61..6730df000e3b8c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -380,20 +380,20 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX8-LABEL: constant_zextload_i8_align2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    flat_store_short v[0:1], v4
+; GFX8-NEXT:    flat_store_short v[2:3], v5
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: constant_zextload_i8_align2:
@@ -404,7 +404,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT:    global_store_short v0, v0, s[0:1] offset:2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: constant_zextload_i8_align2:
@@ -415,7 +415,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt
 ; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX10-NEXT:    global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT:    global_store_short v0, v0, s[0:1] offset:2
 ; GFX10-NEXT:    s_endpgm
   %load = load i8, ptr addrspace(1) %in, align 2
   %zextload = zext i8 %load to i32

>From 0ab36e76d95cf14024a2c083d190d401215bd820 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 20 Jan 2025 16:27:54 +0800
Subject: [PATCH 7/7] Do not use `wip_match_opcode`

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h        |  4 +++-
 llvm/include/llvm/Target/GlobalISel/Combine.td      | 13 +++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index a51aa876e1deb0..161ca5da505962 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -841,7 +841,9 @@ class CombinerHelper {
                                      BuildFnTy &MatchInfo) const;
 
   /// Match shifts greater or equal to the range (bitwidth of the operation, or
-  /// the source value).
+  /// the source value). When matched, also return the minimum useless shift
+  /// amount that results in complete loss of the source value. If the optional
+  /// value is std::nullopt, then the shift result is undefined.
   bool matchShiftsTooBig(MachineInstr &MI,
                          std::optional<int64_t> &MatchInfo) const;
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ae0856550d9356..7230bfaba7f535 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,16 +306,21 @@ def ptr_add_immed_chain : GICombineRule<
          [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
 
+def shift_const_op : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op,
+           [G_SHL, G_ASHR, G_LSHR],
+           (pattern (G_CONSTANT $amt, $imm), (op $dst, $shifted, $amt)))>;
 def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
 def shifts_too_big : GICombineRule<
   (defs root:$root, shift_result_matchdata:$matchinfo),
-  (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
-         [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+  (match (shift_const_op $root):$mi,
+         [{ return Helper.matchShiftsTooBig(*${mi}, ${matchinfo}); }]),
   (apply [{
     if (${matchinfo}) {
-      Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+      Helper.replaceInstWithConstant(*${mi}, *${matchinfo});
     } else {
-      Helper.replaceInstWithUndef(*${root});
+      Helper.replaceInstWithUndef(*${mi});
     }
   }])>;
 



More information about the llvm-commits mailing list