[llvm] 9c0743f - [GlobalISel] Allow expansion of urem by constant in prelegalizer (#145914)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 2 05:46:40 PDT 2025


Author: jyli0116
Date: 2025-07-02T13:46:36+01:00
New Revision: 9c0743fbc5ba38ae5f66444d144d314326dc8468

URL: https://github.com/llvm/llvm-project/commit/9c0743fbc5ba38ae5f66444d144d314326dc8468
DIFF: https://github.com/llvm/llvm-project/commit/9c0743fbc5ba38ae5f66444d144d314326dc8468.diff

LOG: [GlobalISel] Allow expansion of urem by constant in prelegalizer (#145914)

This patch allows urem by a constant to be expanded more efficiently to
avoid the need for expensive udiv instructions. This is part of the
resolution to issue #118090.

Added: 
    llvm/test/CodeGen/AArch64/rem-by-const.ll

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AArch64/pr58431.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index c15263e0b06f8..7d7b5364d6b68 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -693,18 +693,19 @@ class CombinerHelper {
   /// feeding a G_AND instruction \p MI.
   bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
-  /// Given an G_UDIV \p MI expressing a divide by constant, return an
-  /// expression that implements it by multiplying by a magic number.
+  /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant,
+  /// return an expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-  MachineInstr *buildUDivUsingMul(MachineInstr &MI) const;
-  /// Combine G_UDIV by constant into a multiply by magic constant.
-  bool matchUDivByConst(MachineInstr &MI) const;
-  void applyUDivByConst(MachineInstr &MI) const;
+  MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
+  /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
+  bool matchUDivorURemByConst(MachineInstr &MI) const;
+  void applyUDivorURemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
   /// expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
   MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
+  /// Combine G_SDIV by constant into a multiply by magic constant.
   bool matchSDivByConst(MachineInstr &MI) const;
   void applySDivByConst(MachineInstr &MI) const;
 

diff  --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4a92dc16c1bf4..6033d80e717d3 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,8 +1132,8 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 def udiv_by_const : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UDIV):$root,
-   [{ return Helper.matchUDivByConst(*${root}); }]),
-  (apply [{ Helper.applyUDivByConst(*${root}); }])>;
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
@@ -1156,6 +1156,14 @@ def udiv_by_pow2 : GICombineRule<
 def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
                                       sdiv_by_pow2, udiv_by_pow2]>;
 
+def urem_by_const : GICombineRule<
+  (defs root:$root),
+  (match (G_UREM $dst, $x, $y):$root,
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const]>;
+
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_PTR_ADD):$root,
@@ -2048,7 +2056,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     constant_fold_cast_op, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    sub_add_reg, select_to_minmax,
+    intrem_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,

diff  --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 05dd269d48921..3b11d0848d300 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5295,12 +5295,13 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
   return false;
 }
 
-MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
-  auto &UDiv = cast<GenericMachineInstr>(MI);
-  Register Dst = UDiv.getReg(0);
-  Register LHS = UDiv.getReg(1);
-  Register RHS = UDiv.getReg(2);
+MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
+  auto &UDivorRem = cast<GenericMachineInstr>(MI);
+  Register Dst = UDivorRem.getReg(0);
+  Register LHS = UDivorRem.getReg(1);
+  Register RHS = UDivorRem.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
   const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5453,11 +5454,18 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
   auto IsOne = MIB.buildICmp(
       CmpInst::Predicate::ICMP_EQ,
       Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
-  return MIB.buildSelect(Ty, IsOne, LHS, Q);
+  auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q);
+
+  if (Opcode == TargetOpcode::G_UREM) {
+    auto Prod = MIB.buildMul(Ty, ret, RHS);
+    return MIB.buildSub(Ty, LHS, Prod);
+  }
+  return ret;
 }
 
-bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
@@ -5474,7 +5482,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
   if (MF.getFunction().hasMinSize())
     return false;
 
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+  if (Opcode == TargetOpcode::G_UDIV &&
+      MI.getFlag(MachineInstr::MIFlag::IsExact)) {
     return matchUnaryPredicate(
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
@@ -5494,14 +5503,17 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
              {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
               DstTy}}))
       return false;
+    if (Opcode == TargetOpcode::G_UREM &&
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+      return false;
   }
 
   return matchUnaryPredicate(
       MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
-void CombinerHelper::applyUDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivUsingMul(MI);
+void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
+  auto *NewMI = buildUDivorURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 

diff  --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index 88bab4af95d64..467ceb062f249 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -4,10 +4,12 @@
 define i32 @f(i64 %0) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    mov x8, #-7378697629483820647 // =0x9999999999999999
 ; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    udiv x10, x9, x8
-; CHECK-NEXT:    msub x0, x10, x8, x9
+; CHECK-NEXT:    mov w10, #10 // =0xa
+; CHECK-NEXT:    eor x8, x8, #0x8000000000000003
+; CHECK-NEXT:    umulh x8, x9, x8
+; CHECK-NEXT:    msub x0, x8, x10, x9
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %2 = trunc i64 %0 to i32

diff  --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
new file mode 100644
index 0000000000000..1376f5d9a380d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -0,0 +1,3616 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i8 @si8_7(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: si8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #-109 // =0xffffff93
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    add w8, w0, w8, lsr #8
+; CHECK-SD-NEXT:    sbfx w9, w8, #2, #6
+; CHECK-SD-NEXT:    and w8, w8, #0x80
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #7
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 7
+  ret i8 %s
+}
+
+define i8 @si8_100(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: si8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #41 // =0x29
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #12
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 100
+  ret i8 %s
+}
+
+define i8 @ui8_7(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: ui8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 7
+  ret i8 %s
+}
+
+define i8 @ui8_100(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: ui8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #41 // =0x29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #12
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    lsr w8, w8, #4
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 100
+  ret i8 %s
+}
+
+define i16 @si16_7(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: si16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #17
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 7
+  ret i16 %s
+}
+
+define i16 @si16_100(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: si16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #19
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 100
+  ret i16 %s
+}
+
+define i16 @ui16_7(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: ui16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    and w9, w0, #0xffff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #16
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfffe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    and w9, w0, #0xffff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #14
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 7
+  ret i16 %s
+}
+
+define i16 @ui16_100(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: ui16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #17
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-GI-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    lsr w8, w8, #1
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 100
+  ret i16 %s
+}
+
+define i32 @si32_7(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: si32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w9, w8, #2
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w0, w8
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 7
+  ret i32 %s
+}
+
+define i32 @si32_100(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: si32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w9, w0, w8
+; CHECK-GI-NEXT:    msub w0, w9, w8, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 100
+  ret i32 %s
+}
+
+define i32 @ui32_7(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: ui32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    lsr w8, w8, #2
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 7
+  ret i32 %s
+}
+
+define i32 @ui32_100(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: ui32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    lsr w8, w8, #5
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 100
+  ret i32 %s
+}
+
+define i64 @si64_7(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: si64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    asr x9, x8, #1
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv x8, x0, x8
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 7
+  ret i64 %s
+}
+
+define i64 @si64_100(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: si64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x0
+; CHECK-SD-NEXT:    asr x9, x8, #6
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub x0, x8, x9, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv x9, x0, x8
+; CHECK-GI-NEXT:    msub x0, x9, x8, x0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 100
+  ret i64 %s
+}
+
+define i64 @ui64_7(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: ui64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x0, x8
+; CHECK-SD-NEXT:    sub x9, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x8, x0, x8
+; CHECK-GI-NEXT:    sub x9, x0, x8
+; CHECK-GI-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i64 %a, 7
+  ret i64 %s
+}
+
+define i64 @ui64_100(i64 %a, i64 %b) {
+; CHECK-LABEL: ui64_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x9, #62915 // =0xf5c3
+; CHECK-NEXT:    lsr x8, x0, #2
+; CHECK-NEXT:    movk x9, #23592, lsl #16
+; CHECK-NEXT:    movk x9, #49807, lsl #32
+; CHECK-NEXT:    movk x9, #10485, lsl #48
+; CHECK-NEXT:    umulh x8, x8, x9
+; CHECK-NEXT:    mov w9, #100 // =0x64
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    msub x0, x8, x9, x0
+; CHECK-NEXT:    ret
+entry:
+  %s = urem i64 %a, 100
+  ret i64 %s
+}
+
+define i128 @si128_7(i128 %a, i128 %b) {
+; CHECK-LABEL: si128_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #7 // =0x7
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 7
+  ret i128 %s
+}
+
+define i128 @si128_100(i128 %a, i128 %b) {
+; CHECK-LABEL: si128_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #100 // =0x64
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 100
+  ret i128 %s
+}
+
+define i128 @ui128_7(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: ui128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x10, #9362 // =0x2492
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    subs x10, x0, x9
+; CHECK-GI-NEXT:    sbc x11, x1, x8
+; CHECK-GI-NEXT:    lsl x12, x11, #63
+; CHECK-GI-NEXT:    lsr x11, x11, #1
+; CHECK-GI-NEXT:    orr x10, x12, x10, lsr #1
+; CHECK-GI-NEXT:    adds x9, x10, x9
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsl x10, x8, #62
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #2
+; CHECK-GI-NEXT:    mov w10, #7 // =0x7
+; CHECK-GI-NEXT:    lsl x12, x8, #3
+; CHECK-GI-NEXT:    umulh x10, x9, x10
+; CHECK-GI-NEXT:    lsl x11, x9, #3
+; CHECK-GI-NEXT:    sub x8, x12, x8
+; CHECK-GI-NEXT:    sub x9, x11, x9
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 7
+  ret i128 %s
+}
+
+define i128 @ui128_100(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: ui128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x10, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x10, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x8, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    lsl x10, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #4
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    umulh x11, x9, x10
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    madd x8, x8, x10, x11
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 100
+  ret i128 %s
+}
+
+define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: sv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #24
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #24
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 7, i8 7>
+  ret <2 x i8> %s
+}
+
+define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: sv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 100, i8 100>
+  ret <2 x i8> %s
+}
+
+define <3 x i8> @sv3i8_7(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: sv3i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov x9, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    movk x9, #37449, lsl #16
+; CHECK-SD-NEXT:    sxtb w12, w1
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    sxtb w13, w0
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    sxtb w11, w2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    add w8, w8, w13
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    asr w16, w9, #2
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w16, w9, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w0, w13, w8
+; CHECK-SD-NEXT:    add w1, w12, w10
+; CHECK-SD-NEXT:    add w2, w11, w9
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w0, w8, w10
+; CHECK-GI-NEXT:    sdiv w9, w13, w9
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w12, w15, w12
+; CHECK-GI-NEXT:    sub w1, w11, w12
+; CHECK-GI-NEXT:    lsl w16, w9, #3
+; CHECK-GI-NEXT:    sub w9, w16, w9
+; CHECK-GI-NEXT:    sub w2, w13, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i8> %d, <i8 7, i8 7, i8 7>
+  ret <3 x i8> %s
+}
+
+define <3 x i8> @sv3i8_100(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: sv3i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov w9, #34079 // =0x851f
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    movk w9, #20971, lsl #16
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    sxtb w12, w0
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    mov w11, #100 // =0x64
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w11, w12
+; CHECK-SD-NEXT:    sxtb w8, w1
+; CHECK-SD-NEXT:    msub w1, w10, w11, w8
+; CHECK-SD-NEXT:    sxtb w8, w2
+; CHECK-SD-NEXT:    msub w2, w9, w11, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    msub w0, w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w14, w13, w9
+; CHECK-GI-NEXT:    msub w1, w12, w9, w11
+; CHECK-GI-NEXT:    msub w2, w14, w9, w13
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i8> %d, <i8 100, i8 100, i8 100>
+  ret <3 x i8> %s
+}
+
+define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: sv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    smov x12, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smov x14, v0.h[3]
+; CHECK-SD-NEXT:    smov w16, v0.h[2]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    smull x8, w14, w8
+; CHECK-SD-NEXT:    smov w14, v0.h[3]
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    asr w15, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w17, w10, #2
+; CHECK-SD-NEXT:    add w12, w12, w16
+; CHECK-SD-NEXT:    add w9, w15, w9, lsr #31
+; CHECK-SD-NEXT:    asr w15, w12, #2
+; CHECK-SD-NEXT:    add w8, w8, w14
+; CHECK-SD-NEXT:    add w10, w17, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w11, w9
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w9, w15, w12, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w9, w16, w9
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v3.4h, #7
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
+  ret <4 x i8> %s
+}
+
+define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: sv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    sshr v1.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v1.h[0]
+; CHECK-SD-NEXT:    smov x10, v1.h[1]
+; CHECK-SD-NEXT:    smov x11, v1.h[2]
+; CHECK-SD-NEXT:    smov w12, v1.h[0]
+; CHECK-SD-NEXT:    smov x13, v1.h[3]
+; CHECK-SD-NEXT:    smov w15, v1.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    smull x8, w13, w8
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    msub w10, w10, w14, w15
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w9, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w11, v1.h[2]
+; CHECK-SD-NEXT:    msub w9, w9, w14, w11
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    smov w10, v1.h[3]
+; CHECK-SD-NEXT:    msub w8, w8, w14, w10
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v3.4h, #100
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
+  ret <4 x i8> %s
+}
+
+define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #147
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    add v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #41
+; CHECK-SD-NEXT:    movi v2.8b, #100
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #4
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #147
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #7
+; CHECK-SD-NEXT:    add v1.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #2
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v16.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #41
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #100
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #4
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v16.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+define <2 x i8> @uv2i8_7(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #37
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w9
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    umov w8, v2.b[0]
+; CHECK-GI-NEXT:    umov w9, v2.b[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w8
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    neg v2.8b, v2.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 7, i8 7>
+  ret <2 x i8> %s
+}
+
+define <2 x i8> @uv2i8_100(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #41
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 100, i8 100>
+  ret <2 x i8> %s
+}
+
+define <3 x i8> @uv3i8_7(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w9, w11
+; CHECK-SD-NEXT:    add w1, w10, w13
+; CHECK-SD-NEXT:    add w2, w12, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w10
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mul v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    mov b3, v2.b[1]
+; CHECK-GI-NEXT:    mov b4, v2.b[2]
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    mov b2, v1.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[2]
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umov w0, v0.h[0]
+; CHECK-GI-NEXT:    umov w1, v0.h[1]
+; CHECK-GI-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 7, i8 7, i8 7>
+  ret <3 x i8> %s
+}
+
+define <3 x i8> @uv3i8_100(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w0, w11, w14, w9
+; CHECK-SD-NEXT:    msub w1, w13, w14, w10
+; CHECK-SD-NEXT:    msub w2, w8, w14, w12
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w10
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    neg v1.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    neg v1.8b, v3.8b
+; CHECK-GI-NEXT:    fmov s3, w0
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.h[1], w1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.h[2], w2
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mls v3.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w0, v3.h[0]
+; CHECK-GI-NEXT:    umov w1, v3.h[1]
+; CHECK-GI-NEXT:    umov w2, v3.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 100, i8 100, i8 100>
+  ret <3 x i8> %s
+}
+
+define <4 x i8> @uv4i8_7(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w13, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    umull x14, w13, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    lsr x9, x14, #32
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    add w8, w15, w8
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    mov v4.b[1], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    mov v4.b[2], w8
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mov v4.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    uzp1 v1.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v4.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
+  ret <4 x i8> %s
+}
+
+define <4 x i8> @uv4i8_100(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    umull x11, w12, w8
+; CHECK-SD-NEXT:    msub w10, w13, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x9, x11, #32
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w8, w8, w14, w15
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
+  ret <4 x i8> %s
+}
+
+define <8 x i8> @uv8i8_7(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: uv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #37
+; CHECK-SD-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-SD-NEXT:    shrn v2.8b, v2.8h, #1
+; CHECK-SD-NEXT:    add v1.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.8b, #37
+; CHECK-GI-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-GI-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usra v1.8b, v2.8b, #1
+; CHECK-GI-NEXT:    movi v2.8b, #7
+; CHECK-GI-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-GI-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-LABEL: uv8i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.8b, #41
+; CHECK-NEXT:    movi v2.8b, #100
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    ushr v1.8b, v1.8b, #4
+; CHECK-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+define <16 x i8> @uv16i8_7(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-LABEL: uv16i8_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #37
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    usra v1.16b, v2.16b, #1
+; CHECK-NEXT:    movi v2.16b, #7
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #2
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @uv16i8_100(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-LABEL: uv16i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #41
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    movi v2.16b, #100
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #4
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: sv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #16
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #16
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: sv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+define <3 x i16> @sv3i16_7(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: sv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov w12, v0.h[0]
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w14, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    add w8, w8, w11
+; CHECK-SD-NEXT:    add w9, w14, w9, lsr #31
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w12, w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w8, w11, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+define <3 x i16> @sv3i16_100(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: sv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    msub w9, w9, w12, w11
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w10, w10, w12, w13
+; CHECK-SD-NEXT:    msub w8, w8, w12, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: sv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: sv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #19
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: sv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: sv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #3
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+define <2 x i16> @uv2i16_7(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushr v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    ushl v2.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+define <2 x i16> @uv2i16_100(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    ushl v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+define <3 x i16> @uv3i16_7(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    add w8, w12, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    umov w10, v0.h[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    umov w11, v0.h[2]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #16 // =0x10
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sub v4.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v3.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v4.4h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+define <3 x i16> @uv3i16_100(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    lsr x11, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w10, w11, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    msub w8, w8, w14, w12
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov w11, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w11
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    umov w10, v1.h[2]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #16 // =0x10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w10
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    neg v3.4h, v4.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v3.4h
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+define <4 x i16> @uv4i16_7(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-SD-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    shrn v2.4h, v2.4s, #1
+; CHECK-SD-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI52_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI52_0]
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    usra v1.4h, v2.4h, #1
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.4h, v0.4h, #2
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI53_0
+; CHECK-GI-NEXT:    ushr v1.4h, v0.4h, #2
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #1
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+define <8 x i16> @uv8i16_7(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @uv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI55_0
+; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI55_0]
+; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.8h, #100
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w9
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.4s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #2
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v3.4s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #5
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+define <2 x i32> @uv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI62_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    usra v1.2s, v2.2s, #1
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI63_0
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI63_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #5
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w10, w9, w8
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add w8, w8, w10, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI64_0
+; CHECK-GI-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI64_0]
+; CHECK-GI-NEXT:    mov w8, v0.s[2]
+; CHECK-GI-NEXT:    movk w9, #9362, lsl #16
+; CHECK-GI-NEXT:    mov w10, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x8, w8, w9
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    sub v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v2.4s
+; CHECK-GI-NEXT:    ushl v2.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI65_0
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI65_0]
+; CHECK-GI-NEXT:    mov w8, #5 // =0x5
+; CHECK-GI-NEXT:    mov w10, #34079 // =0x851f
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    movk w10, #20971, lsl #16
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x9, w9, w10
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    lsr x8, x9, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    neg v3.4s, v3.4s
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    fmov s1, w11
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+define <4 x i32> @uv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT:    movi v2.4s, #7
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI66_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI66_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @uv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    movi v2.4s, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI67_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI67_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+define <2 x i64> @sv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    asr x12, x11, #1
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI68_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI68_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+define <2 x i64> @sv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    add x11, x11, x10
+; CHECK-SD-NEXT:    asr x12, x11, #6
+; CHECK-SD-NEXT:    add x8, x8, x9
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #6
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI69_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI69_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+define <2 x i64> @uv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x11, x10, x8
+; CHECK-SD-NEXT:    umulh x8, x9, x8
+; CHECK-SD-NEXT:    sub x12, x10, x11
+; CHECK-SD-NEXT:    add x11, x11, x12, lsr #1
+; CHECK-SD-NEXT:    sub x12, x9, x8
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    add x8, x8, x12, lsr #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x9, v0.d[1]
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI70_0
+; CHECK-GI-NEXT:    sub v2.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    usra v1.2d, v2.2d, #1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI70_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+define <2 x i64> @uv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #23592, lsl #16
+; CHECK-SD-NEXT:    movk x8, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x11, x10, #2
+; CHECK-SD-NEXT:    movk x8, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x12, x9, #2
+; CHECK-SD-NEXT:    umulh x11, x11, x8
+; CHECK-SD-NEXT:    umulh x8, x12, x8
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v1.2d, v0.2d, #2
+; CHECK-GI-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI71_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI71_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+define <2 x i128> @sv2i128_7(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 7, i128 7>
+  ret <2 x i128> %s
+}
+
+define <2 x i128> @sv2i128_100(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 100, i128 100>
+  ret <2 x i128> %s
+}
+
+define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: uv2i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x10, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x8, #9362 // =0x2492
+; CHECK-GI-NEXT:    sub x4, x0, x0
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    umulh x18, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x12, x0, x8
+; CHECK-GI-NEXT:    umulh x13, x0, x10
+; CHECK-GI-NEXT:    mul x14, x1, x8
+; CHECK-GI-NEXT:    adds x11, x11, x12
+; CHECK-GI-NEXT:    umulh x15, x1, x10
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    cmn x11, x13
+; CHECK-GI-NEXT:    and x11, x12, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    add x14, x14, x4
+; CHECK-GI-NEXT:    and x12, x12, #0x1
+; CHECK-GI-NEXT:    and x4, xzr, #0x1
+; CHECK-GI-NEXT:    mul x13, x3, x10
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    adds x14, x14, x15
+; CHECK-GI-NEXT:    add x12, x12, x4
+; CHECK-GI-NEXT:    mul x5, x2, x8
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    adds x14, x14, x16
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    adds x11, x14, x11
+; CHECK-GI-NEXT:    add x12, x12, x16
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    umulh x17, x1, x8
+; CHECK-GI-NEXT:    add x12, x12, x16
+; CHECK-GI-NEXT:    adds x13, x13, x5
+; CHECK-GI-NEXT:    umulh x15, x2, x10
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    mul x6, x3, x8
+; CHECK-GI-NEXT:    umulh x10, x3, x10
+; CHECK-GI-NEXT:    cmn x13, x15
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    add x14, x9, x17
+; CHECK-GI-NEXT:    umulh x15, x2, x8
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    add x13, x14, x18
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    sub x17, x2, x2
+; CHECK-GI-NEXT:    and x18, xzr, #0x1
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    add x14, x16, x14
+; CHECK-GI-NEXT:    add x16, x6, x17
+; CHECK-GI-NEXT:    and x17, xzr, #0x1
+; CHECK-GI-NEXT:    adds x10, x16, x10
+; CHECK-GI-NEXT:    add x17, x17, x18
+; CHECK-GI-NEXT:    cset w16, hs
+; CHECK-GI-NEXT:    adds x10, x10, x15
+; CHECK-GI-NEXT:    umulh x15, x2, xzr
+; CHECK-GI-NEXT:    and x16, x16, #0x1
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x10, x10, x14
+; CHECK-GI-NEXT:    add x16, x17, x16
+; CHECK-GI-NEXT:    and x17, x18, #0x1
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    add x13, x16, x17
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    subs x9, x0, x11
+; CHECK-GI-NEXT:    add x13, x13, x14
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    sbc x14, x1, x12
+; CHECK-GI-NEXT:    add x8, x8, x13
+; CHECK-GI-NEXT:    subs x13, x2, x10
+; CHECK-GI-NEXT:    lsl x15, x14, #63
+; CHECK-GI-NEXT:    sbc x16, x3, x8
+; CHECK-GI-NEXT:    lsr x14, x14, #1
+; CHECK-GI-NEXT:    orr x9, x15, x9, lsr #1
+; CHECK-GI-NEXT:    lsl x15, x16, #63
+; CHECK-GI-NEXT:    orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    lsr x11, x16, #1
+; CHECK-GI-NEXT:    adc x12, x14, x12
+; CHECK-GI-NEXT:    adds x10, x13, x10
+; CHECK-GI-NEXT:    lsl x13, x12, #62
+; CHECK-GI-NEXT:    lsr x12, x12, #2
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsl x11, x8, #62
+; CHECK-GI-NEXT:    orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT:    mov w13, #7 // =0x7
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x14, x12, #3
+; CHECK-GI-NEXT:    orr x10, x11, x10, lsr #2
+; CHECK-GI-NEXT:    umulh x11, x9, x13
+; CHECK-GI-NEXT:    lsl x15, x9, #3
+; CHECK-GI-NEXT:    sub x12, x14, x12
+; CHECK-GI-NEXT:    lsl x16, x8, #3
+; CHECK-GI-NEXT:    umulh x13, x10, x13
+; CHECK-GI-NEXT:    lsl x14, x10, #3
+; CHECK-GI-NEXT:    sub x9, x15, x9
+; CHECK-GI-NEXT:    sub x8, x16, x8
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    sub x10, x14, x10
+; CHECK-GI-NEXT:    add x11, x12, x11
+; CHECK-GI-NEXT:    sbc x1, x1, x11
+; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    add x8, x8, x13
+; CHECK-GI-NEXT:    sbc x3, x3, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i128> %d, <i128 7, i128 7>
+  ret <2 x i128> %s
+}
+
+define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: uv2i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x8, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    sub x18, x0, x0
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x12, x0, x8
+; CHECK-GI-NEXT:    umulh x13, x0, x10
+; CHECK-GI-NEXT:    mul x14, x1, x8
+; CHECK-GI-NEXT:    adds x11, x11, x12
+; CHECK-GI-NEXT:    umulh x15, x1, x10
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    cmn x11, x13
+; CHECK-GI-NEXT:    and x11, x12, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    and x12, x12, #0x1
+; CHECK-GI-NEXT:    add x14, x14, x18
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    adds x14, x14, x15
+; CHECK-GI-NEXT:    and x15, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x17, x1, x8
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    add x15, x12, x15
+; CHECK-GI-NEXT:    adds x12, x14, x16
+; CHECK-GI-NEXT:    and x4, x4, #0x1
+; CHECK-GI-NEXT:    mul x18, x3, x10
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    adds x12, x12, x11
+; CHECK-GI-NEXT:    add x11, x15, x4
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    mul x5, x2, x8
+; CHECK-GI-NEXT:    add x11, x11, x14
+; CHECK-GI-NEXT:    and x14, x15, #0x1
+; CHECK-GI-NEXT:    add x17, x9, x17
+; CHECK-GI-NEXT:    add x14, x11, x14
+; CHECK-GI-NEXT:    mov w11, #100 // =0x64
+; CHECK-GI-NEXT:    umulh x13, x0, xzr
+; CHECK-GI-NEXT:    umulh x16, x2, x10
+; CHECK-GI-NEXT:    adds x18, x18, x5
+; CHECK-GI-NEXT:    mul x15, x3, x8
+; CHECK-GI-NEXT:    add x13, x17, x13
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    umulh x10, x3, x10
+; CHECK-GI-NEXT:    add x13, x13, x14
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    cmn x18, x16
+; CHECK-GI-NEXT:    sub x18, x2, x2
+; CHECK-GI-NEXT:    umulh x16, x2, x8
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    add x15, x15, x18
+; CHECK-GI-NEXT:    and x18, xzr, #0x1
+; CHECK-GI-NEXT:    add x14, x17, x14
+; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    and x17, xzr, #0x1
+; CHECK-GI-NEXT:    adds x10, x15, x10
+; CHECK-GI-NEXT:    add x15, x17, x18
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    umulh x18, x2, xzr
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    adds x10, x10, x16
+; CHECK-GI-NEXT:    lsl x16, x13, #60
+; CHECK-GI-NEXT:    add x15, x15, x17
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    adds x10, x10, x14
+; CHECK-GI-NEXT:    and x14, x17, #0x1
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    add x14, x15, x14
+; CHECK-GI-NEXT:    and x15, x17, #0x1
+; CHECK-GI-NEXT:    orr x12, x16, x12, lsr #4
+; CHECK-GI-NEXT:    add x9, x14, x15
+; CHECK-GI-NEXT:    add x8, x8, x18
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    lsr x9, x13, #4
+; CHECK-GI-NEXT:    umulh x14, x12, x11
+; CHECK-GI-NEXT:    lsl x13, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    mul x12, x12, x11
+; CHECK-GI-NEXT:    orr x10, x13, x10, lsr #4
+; CHECK-GI-NEXT:    madd x9, x9, x11, x14
+; CHECK-GI-NEXT:    umulh x13, x10, x11
+; CHECK-GI-NEXT:    subs x0, x0, x12
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    sbc x1, x1, x9
+; CHECK-GI-NEXT:    madd x8, x8, x11, x13
+; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    sbc x3, x3, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i128> %d, <i128 100, i128 100>
+  ret <2 x i128> %s
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 24ec4fa48f778..6ae2f56f6ae6d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -211,91 +211,41 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, 0x4996c7d8
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 1235195
   ret i32 %result
 }
 
 define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
-; GISEL-LABEL: v_urem_v2i32_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i32_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CGP-NEXT:    v_mov_b32_e32 v3, 0xffed2705
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v2, v3
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
+; CHECK-NEXT:    v_mul_hi_u32 v4, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 1, v6
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 20, v4
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index f6a228614a27e..2a1bf4bf068f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -968,523 +968,106 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v3
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, vcc, v1, v3, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, -1, v3, s[6:7]
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v2, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 20
+; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 1235195
   ret i64 %result
 }
 
 define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
-; GISEL-LABEL: v_urem_v2i64_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT:    s_mov_b32 s4, 1
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; GISEL-NEXT:    s_mov_b32 s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s6, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v6
-; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v17, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v7, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v18, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v19, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v18, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v11
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v1, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v17, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[4:5], vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; GISEL-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i64_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT:    v_mov_b32_e32 v7, 0xffed2705
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v7, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, v7
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v0, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v0, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v18, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v7, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v9
-; CGP-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
-; CGP-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    s_mov_b64 s[4:5], vcc
-; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; CGP-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i64_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v13, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v14, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v15, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v16, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v17, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v18, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v19, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v17, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v4, v18
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v14, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v15, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v19, v7
+; CHECK-NEXT:    v_lshr_b64 v[4:5], v[4:5], 20
+; CHECK-NEXT:    v_lshr_b64 v[6:7], v[6:7], 20
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v5, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
 }


        


More information about the llvm-commits mailing list