[llvm] [GlobalISel] Allow expansion of urem by constant in prelegalizer (PR #145914)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 26 11:04:48 PDT 2025


https://github.com/jyli0116 updated https://github.com/llvm/llvm-project/pull/145914

>From 8ba8f12fbb65de31bb26f864aa85d48885fc6dd8 Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 14:47:36 +0000
Subject: [PATCH 1/3] [GlobalISel] Allow expansion of urem by constant in
 prelegalizer

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  10 +-
 .../include/llvm/Target/GlobalISel/Combine.td |  10 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  69 +-
 .../AArch64/GlobalISel/combine_urem.ll        | 243 +++++++
 llvm/test/CodeGen/AArch64/pr58431.ll          |   8 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll     | 104 +--
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     | 595 +++---------------
 7 files changed, 441 insertions(+), 598 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index c15263e0b06f8..9139425658480 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -693,18 +693,22 @@ class CombinerHelper {
   /// feeding a G_AND instruction \p MI.
   bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
-  /// Given an G_UDIV \p MI expressing a divide by constant, return an
-  /// expression that implements it by multiplying by a magic number.
+  /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant,
+  /// return an expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-  MachineInstr *buildUDivUsingMul(MachineInstr &MI) const;
+  MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
   /// Combine G_UDIV by constant into a multiply by magic constant.
   bool matchUDivByConst(MachineInstr &MI) const;
   void applyUDivByConst(MachineInstr &MI) const;
+  /// Combine G_UREM by constant into a multiply by magic constant.
+  bool matchURemByConst(MachineInstr &MI) const;
+  void applyURemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
   /// expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
   MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
+  /// Combine G_SDIV by constant into a multiply by magic constant.
   bool matchSDivByConst(MachineInstr &MI) const;
   void applySDivByConst(MachineInstr &MI) const;
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4a92dc16c1bf4..52cbbf91849b6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1156,6 +1156,14 @@ def udiv_by_pow2 : GICombineRule<
 def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
                                       sdiv_by_pow2, udiv_by_pow2]>;
 
+def urem_by_const : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UREM):$root,
+   [{ return Helper.matchURemByConst(*${root}); }]),
+  (apply [{ Helper.applyURemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const]>;
+
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_PTR_ADD):$root,
@@ -2048,7 +2056,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     constant_fold_cast_op, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    sub_add_reg, select_to_minmax,
+    intrem_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b1e851183de0d..c511f27a5e8ce 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5288,12 +5288,13 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
   return false;
 }
 
-MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
-  auto &UDiv = cast<GenericMachineInstr>(MI);
-  Register Dst = UDiv.getReg(0);
-  Register LHS = UDiv.getReg(1);
-  Register RHS = UDiv.getReg(2);
+MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+  unsigned opcode = MI.getOpcode();
+  assert(opcode == TargetOpcode::G_UDIV || opcode == TargetOpcode::G_UREM);
+  auto &UDivorRem = cast<GenericMachineInstr>(MI);
+  Register Dst = UDivorRem.getReg(0);
+  Register LHS = UDivorRem.getReg(1);
+  Register RHS = UDivorRem.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
   const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5446,7 +5447,13 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
   auto IsOne = MIB.buildICmp(
       CmpInst::Predicate::ICMP_EQ,
       Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
-  return MIB.buildSelect(Ty, IsOne, LHS, Q);
+  auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q);
+
+  if (opcode == TargetOpcode::G_UREM) {
+    auto Prod = MIB.buildMul(Ty, ret, RHS);
+    return MIB.buildSub(Ty, LHS, Prod);
+  }
+  return ret;
 }
 
 bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
@@ -5494,7 +5501,53 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
 }
 
 void CombinerHelper::applyUDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivUsingMul(MI);
+  auto *NewMI = buildUDivorURemUsingMul(MI);
+  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UREM);
+  Register Dst = MI.getOperand(0).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(Dst);
+
+  auto &MF = *MI.getMF();
+  AttributeList Attr = MF.getFunction().getAttributes();
+  const auto &TLI = getTargetLowering();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, Ctx), Attr))
+    return false;
+
+  // Don't do this for minsize because the instruction sequence is usually
+  // larger.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ICMP,
+             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+              DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+        return false;
+  }
+
+  return matchUnaryPredicate(
+      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
+}
+
+void CombinerHelper::applyURemByConst(MachineInstr &MI) const {
+  auto *NewMI = buildUDivorURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
new file mode 100644
index 0000000000000..0cf827410c30c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=CHECK-GI
+
+
+define i8 @test7s8(i8 %a) {
+; CHECK-SD-LABEL: test7s8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7s8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+  %r = urem i8 %a, 7
+  ret i8 %r
+}
+
+define i8 @test100s8(i8 %a) {
+; CHECK-SD-LABEL: test100s8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #41 // =0x29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #12
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100s8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    lsr w8, w8, #4
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+  %r = urem i8 %a, 100
+  ret i8 %r
+}
+
+define i32 @test7s32(i32 %a) {
+; CHECK-SD-LABEL: test7s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    lsr w8, w8, #2
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+ %c = urem i32 %a, 7
+ ret i32 %c
+}
+
+define i32 @test100s32(i32 %a) {
+; CHECK-SD-LABEL: test100s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    lsr w8, w8, #5
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+ %c = urem i32 %a, 100
+ ret i32 %c
+}
+
+define <8 x i16> @test7v8s16(<8 x i16> %a) {
+; CHECK-SD-LABEL: test7v8s16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7v8s16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+  %r = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @test100v8s16(<8 x i16> %a) {
+; CHECK-SD-LABEL: test100v8s16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100v8s16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI5_0
+; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.8h, #100
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+  %r = urem <8 x i16> %a, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @test7v4s32(<4 x i32> %a) {
+; CHECK-SD-LABEL: test7v4s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT:    movi v2.4s, #7
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7v4s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI6_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+  %r = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test100v4s32(<4 x i32> %a) {
+; CHECK-SD-LABEL: test100v4s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    movi v2.4s, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100v4s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+  %r = urem <4 x i32> %a, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %r
+}
+
diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index 88bab4af95d64..467ceb062f249 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -4,10 +4,12 @@
 define i32 @f(i64 %0) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    mov x8, #-7378697629483820647 // =0x9999999999999999
 ; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    udiv x10, x9, x8
-; CHECK-NEXT:    msub x0, x10, x8, x9
+; CHECK-NEXT:    mov w10, #10 // =0xa
+; CHECK-NEXT:    eor x8, x8, #0x8000000000000003
+; CHECK-NEXT:    umulh x8, x9, x8
+; CHECK-NEXT:    msub x0, x8, x10, x9
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %2 = trunc i64 %0 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 24ec4fa48f778..6ae2f56f6ae6d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -211,91 +211,41 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, 0x4996c7d8
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 1235195
   ret i32 %result
 }
 
 define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
-; GISEL-LABEL: v_urem_v2i32_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i32_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CGP-NEXT:    v_mov_b32_e32 v3, 0xffed2705
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v2, v3
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
+; CHECK-NEXT:    v_mul_hi_u32 v4, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 1, v6
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 20, v4
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index f6a228614a27e..2a1bf4bf068f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -968,523 +968,106 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v3
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, vcc, v1, v3, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, -1, v3, s[6:7]
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v2, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 20
+; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 1235195
   ret i64 %result
 }
 
 define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
-; GISEL-LABEL: v_urem_v2i64_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT:    s_mov_b32 s4, 1
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; GISEL-NEXT:    s_mov_b32 s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s6, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v6
-; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v17, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v7, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v18, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v19, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v18, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v11
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v1, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v17, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[4:5], vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; GISEL-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i64_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT:    v_mov_b32_e32 v7, 0xffed2705
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v7, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, v7
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v0, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v0, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v18, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v7, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v9
-; CGP-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
-; CGP-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    s_mov_b64 s[4:5], vcc
-; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; CGP-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i64_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v13, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v14, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v15, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v16, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v17, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v18, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v19, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v17, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v4, v18
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v14, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v15, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v19, v7
+; CHECK-NEXT:    v_lshr_b64 v[4:5], v[4:5], 20
+; CHECK-NEXT:    v_lshr_b64 v[6:7], v[6:7], 20
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v5, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
 }

>From 395379e4f133c0398a83e5d9e0c9ae0235972e1c Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 15:00:01 +0000
Subject: [PATCH 2/3] formatting

---
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c511f27a5e8ce..6ef23a68092db 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5539,7 +5539,7 @@ bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
               DstTy}}))
       return false;
     if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
-        return false;
+      return false;
   }
 
   return matchUnaryPredicate(

>From c5c3e3649cf055db9c58508fa3a3b06c47e94f6c Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 17:56:30 +0000
Subject: [PATCH 3/3] [GlobalISel] Combine matchUDiv/URem and applyUDiv/URem
 for less repeated code

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  9 +--
 .../include/llvm/Target/GlobalISel/Combine.td |  8 +--
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 63 ++++---------------
 3 files changed, 18 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9139425658480..7d7b5364d6b68 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -697,12 +697,9 @@ class CombinerHelper {
   /// return an expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
   MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
-  /// Combine G_UDIV by constant into a multiply by magic constant.
-  bool matchUDivByConst(MachineInstr &MI) const;
-  void applyUDivByConst(MachineInstr &MI) const;
-  /// Combine G_UREM by constant into a multiply by magic constant.
-  bool matchURemByConst(MachineInstr &MI) const;
-  void applyURemByConst(MachineInstr &MI) const;
+  /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
+  bool matchUDivorURemByConst(MachineInstr &MI) const;
+  void applyUDivorURemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
   /// expression that implements it by multiplying by a magic number.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 52cbbf91849b6..84675b41c063e 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,8 +1132,8 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 def udiv_by_const : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UDIV):$root,
-   [{ return Helper.matchUDivByConst(*${root}); }]),
-  (apply [{ Helper.applyUDivByConst(*${root}); }])>;
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
@@ -1159,8 +1159,8 @@ def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
 def urem_by_const : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UREM):$root,
-   [{ return Helper.matchURemByConst(*${root}); }]),
-  (apply [{ Helper.applyURemByConst(*${root}); }])>;
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def intrem_combines : GICombineGroup<[urem_by_const]>;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 6ef23a68092db..94d1a347a8b0c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5289,8 +5289,8 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
 }
 
 MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
-  unsigned opcode = MI.getOpcode();
-  assert(opcode == TargetOpcode::G_UDIV || opcode == TargetOpcode::G_UREM);
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   auto &UDivorRem = cast<GenericMachineInstr>(MI);
   Register Dst = UDivorRem.getReg(0);
   Register LHS = UDivorRem.getReg(1);
@@ -5449,15 +5449,16 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
       Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
   auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q);
 
-  if (opcode == TargetOpcode::G_UREM) {
+  if (Opcode == TargetOpcode::G_UREM) {
     auto Prod = MIB.buildMul(Ty, ret, RHS);
     return MIB.buildSub(Ty, LHS, Prod);
   }
   return ret;
 }
 
-bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
@@ -5474,7 +5475,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
   if (MF.getFunction().hasMinSize())
     return false;
 
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+  if (Opcode == TargetOpcode::G_UDIV &&
+      MI.getFlag(MachineInstr::MIFlag::IsExact)) {
     return matchUnaryPredicate(
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
@@ -5494,51 +5496,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
              {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
               DstTy}}))
       return false;
-  }
-
-  return matchUnaryPredicate(
-      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
-}
-
-void CombinerHelper::applyUDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivorURemUsingMul(MI);
-  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
-}
-
-bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UREM);
-  Register Dst = MI.getOperand(0).getReg();
-  Register RHS = MI.getOperand(2).getReg();
-  LLT DstTy = MRI.getType(Dst);
-
-  auto &MF = *MI.getMF();
-  AttributeList Attr = MF.getFunction().getAttributes();
-  const auto &TLI = getTargetLowering();
-  LLVMContext &Ctx = MF.getFunction().getContext();
-  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, Ctx), Attr))
-    return false;
-
-  // Don't do this for minsize because the instruction sequence is usually
-  // larger.
-  if (MF.getFunction().hasMinSize())
-    return false;
-
-  auto *RHSDef = MRI.getVRegDef(RHS);
-  if (!isConstantOrConstantVector(*RHSDef, MRI))
-    return false;
-
-  // Don't do this if the types are not going to be legal.
-  if (LI) {
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_ICMP,
-             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
-              DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+    if (Opcode == TargetOpcode::G_UREM &&
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
       return false;
   }
 
@@ -5546,7 +5505,7 @@ bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
       MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
-void CombinerHelper::applyURemByConst(MachineInstr &MI) const {
+void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
   auto *NewMI = buildUDivorURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }



More information about the llvm-commits mailing list