[llvm] [GlobalISel] Allow expansion of urem by constant in prelegalizer (PR #145914)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 30 06:34:22 PDT 2025


https://github.com/jyli0116 updated https://github.com/llvm/llvm-project/pull/145914

>From 8ba8f12fbb65de31bb26f864aa85d48885fc6dd8 Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 14:47:36 +0000
Subject: [PATCH 1/5] [GlobalISel] Allow expansion of urem by constant in
 prelegalizer

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  10 +-
 .../include/llvm/Target/GlobalISel/Combine.td |  10 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  69 +-
 .../AArch64/GlobalISel/combine_urem.ll        | 243 +++++++
 llvm/test/CodeGen/AArch64/pr58431.ll          |   8 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll     | 104 +--
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     | 595 +++---------------
 7 files changed, 441 insertions(+), 598 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index c15263e0b06f8..9139425658480 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -693,18 +693,22 @@ class CombinerHelper {
   /// feeding a G_AND instruction \p MI.
   bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
-  /// Given an G_UDIV \p MI expressing a divide by constant, return an
-  /// expression that implements it by multiplying by a magic number.
+  /// Given a G_UDIV or G_UREM \p MI whose divisor is a constant, return an
+  /// expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-  MachineInstr *buildUDivUsingMul(MachineInstr &MI) const;
+  MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
   /// Combine G_UDIV by constant into a multiply by magic constant.
   bool matchUDivByConst(MachineInstr &MI) const;
   void applyUDivByConst(MachineInstr &MI) const;
+  /// Combine G_UREM by constant into a multiply by magic constant.
+  bool matchURemByConst(MachineInstr &MI) const;
+  void applyURemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
   /// expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
   MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
+  /// Combine G_SDIV by constant into a multiply by magic constant.
   bool matchSDivByConst(MachineInstr &MI) const;
   void applySDivByConst(MachineInstr &MI) const;
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4a92dc16c1bf4..52cbbf91849b6 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1156,6 +1156,14 @@ def udiv_by_pow2 : GICombineRule<
 def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
                                       sdiv_by_pow2, udiv_by_pow2]>;
 
+def urem_by_const : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UREM):$root,
+   [{ return Helper.matchURemByConst(*${root}); }]),
+  (apply [{ Helper.applyURemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const]>;
+
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_PTR_ADD):$root,
@@ -2048,7 +2056,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     constant_fold_cast_op, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    sub_add_reg, select_to_minmax,
+    intrem_combines, sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
     simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b1e851183de0d..c511f27a5e8ce 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5288,12 +5288,13 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
   return false;
 }
 
-MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
-  auto &UDiv = cast<GenericMachineInstr>(MI);
-  Register Dst = UDiv.getReg(0);
-  Register LHS = UDiv.getReg(1);
-  Register RHS = UDiv.getReg(2);
+MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+  unsigned opcode = MI.getOpcode();
+  assert(opcode == TargetOpcode::G_UDIV || opcode == TargetOpcode::G_UREM);
+  auto &UDivorRem = cast<GenericMachineInstr>(MI);
+  Register Dst = UDivorRem.getReg(0);
+  Register LHS = UDivorRem.getReg(1);
+  Register RHS = UDivorRem.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
   const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5446,7 +5447,13 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const {
   auto IsOne = MIB.buildICmp(
       CmpInst::Predicate::ICMP_EQ,
       Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
-  return MIB.buildSelect(Ty, IsOne, LHS, Q);
+  auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q);
+
+  if (opcode == TargetOpcode::G_UREM) {
+    auto Prod = MIB.buildMul(Ty, ret, RHS);
+    return MIB.buildSub(Ty, LHS, Prod);
+  }
+  return ret;
 }
 
 bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
@@ -5494,7 +5501,53 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
 }
 
 void CombinerHelper::applyUDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivUsingMul(MI);
+  auto *NewMI = buildUDivorURemUsingMul(MI);
+  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UREM);
+  Register Dst = MI.getOperand(0).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  // Leave the G_UREM alone when the target reports integer division as cheap.
+  auto &MF = *MI.getMF();
+  AttributeList Attr = MF.getFunction().getAttributes();
+  const auto &TLI = getTargetLowering();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, Ctx), Attr))
+    return false;
+
+  // Don't do this for minsize because the instruction sequence is usually
+  // larger.
+  if (MF.getFunction().hasMinSize())
+    return false;
+  // Only a constant or constant-vector divisor can be expanded.
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  // Don't do this if the types are not going to be legal.
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ICMP,
+             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+              DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+      return false;
+  }
+
+  return matchUnaryPredicate(
+      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
+}
+
+void CombinerHelper::applyURemByConst(MachineInstr &MI) const {
+  auto *NewMI = buildUDivorURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
new file mode 100644
index 0000000000000..0cf827410c30c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=CHECK-GI
+
+
+define i8 @test7s8(i8 %a) {
+; CHECK-SD-LABEL: test7s8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7s8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+  %r = urem i8 %a, 7
+  ret i8 %r
+}
+
+define i8 @test100s8(i8 %a) {
+; CHECK-SD-LABEL: test100s8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #41 // =0x29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #12
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100s8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    lsr w8, w8, #4
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+  %r = urem i8 %a, 100
+  ret i8 %r
+}
+
+define i32 @test7s32(i32 %a) {
+; CHECK-SD-LABEL: test7s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    lsr w8, w8, #2
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+ %c = urem i32 %a, 7
+ ret i32 %c
+}
+
+define i32 @test100s32(i32 %a) {
+; CHECK-SD-LABEL: test100s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    lsr w8, w8, #5
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+ %c = urem i32 %a, 100
+ ret i32 %c
+}
+
+define <8 x i16> @test7v8s16(<8 x i16> %a) {
+; CHECK-SD-LABEL: test7v8s16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7v8s16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+  %r = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @test100v8s16(<8 x i16> %a) {
+; CHECK-SD-LABEL: test100v8s16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100v8s16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI5_0
+; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.8h, #100
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+  %r = urem <8 x i16> %a, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @test7v4s32(<4 x i32> %a) {
+; CHECK-SD-LABEL: test7v4s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT:    movi v2.4s, #7
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test7v4s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI6_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+  %r = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test100v4s32(<4 x i32> %a) {
+; CHECK-SD-LABEL: test100v4s32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    movi v2.4s, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test100v4s32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+  %r = urem <4 x i32> %a, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %r
+}
+
diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index 88bab4af95d64..467ceb062f249 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -4,10 +4,12 @@
 define i32 @f(i64 %0) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    mov x8, #-7378697629483820647 // =0x9999999999999999
 ; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    udiv x10, x9, x8
-; CHECK-NEXT:    msub x0, x10, x8, x9
+; CHECK-NEXT:    mov w10, #10 // =0xa
+; CHECK-NEXT:    eor x8, x8, #0x8000000000000003
+; CHECK-NEXT:    umulh x8, x9, x8
+; CHECK-NEXT:    msub x0, x8, x10, x9
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %2 = trunc i64 %0 to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 24ec4fa48f778..6ae2f56f6ae6d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -211,91 +211,41 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, 0x4996c7d8
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 0xffed2705, v0
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 1235195
   ret i32 %result
 }
 
 define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
-; GISEL-LABEL: v_urem_v2i32_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 0xffed2705, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i32_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CGP-NEXT:    v_mov_b32_e32 v3, 0xffed2705
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v2, v3
-; CGP-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 0xffed2705, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i32_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
+; CHECK-NEXT:    v_mul_hi_u32 v4, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 1, v6
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 20, v4
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index f6a228614a27e..2a1bf4bf068f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -968,523 +968,106 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v3
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, vcc, v1, v3, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, -1, v3, s[6:7]
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5
-; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v2, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 20
+; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 1235195
   ret i64 %result
 }
 
 define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
-; GISEL-LABEL: v_urem_v2i64_oddk_denom:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT:    s_mov_b32 s4, 1
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
-; GISEL-NEXT:    s_mov_b32 s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s6, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v6
-; GISEL-NEXT:    v_mul_hi_u32 v11, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v17, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v7, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v18, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v19, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v19, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v7, v5
-; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], v18, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v11
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v1, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v9
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v5
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v5
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v17, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    s_mov_b64 s[4:5], vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; GISEL-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_urem_v2i64_oddk_denom:
-; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT:    v_mov_b32_e32 v7, 0xffed2705
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v7, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, v7
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v0, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v0, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v18, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v7, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v5, v5, v4
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v9
-; CGP-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
-; CGP-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    s_mov_b64 s[4:5], vcc
-; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9
-; CGP-NEXT:    v_sub_i32_e64 v12, s[6:7], v0, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
-; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v13, -1, v13, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v12, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v10, -1, v10, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
-; CGP-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_urem_v2i64_oddk_denom:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0xd9528440
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0x12d8fb
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v13, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v14, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v15, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v16, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v17, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v18, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v19, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v17, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v4, v18
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v14, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v15, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v19, v7
+; CHECK-NEXT:    v_lshr_b64 v[4:5], v[4:5], 20
+; CHECK-NEXT:    v_lshr_b64 v[6:7], v[6:7], 20
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v5, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v4, v4, v8
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v8
+; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
 }

>From 395379e4f133c0398a83e5d9e0c9ae0235972e1c Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 15:00:01 +0000
Subject: [PATCH 2/5] formatting

---
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c511f27a5e8ce..6ef23a68092db 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5539,7 +5539,7 @@ bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
               DstTy}}))
       return false;
     if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
-        return false;
+      return false;
   }
 
   return matchUnaryPredicate(

>From c5c3e3649cf055db9c58508fa3a3b06c47e94f6c Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 26 Jun 2025 17:56:30 +0000
Subject: [PATCH 3/5] [GlobalISel] Combine matchUDiv/URem and applyUDiv/URem
 for less repeated code

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  9 +--
 .../include/llvm/Target/GlobalISel/Combine.td |  8 +--
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 63 ++++---------------
 3 files changed, 18 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9139425658480..7d7b5364d6b68 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -697,12 +697,9 @@ class CombinerHelper {
   /// return an expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
   MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
-  /// Combine G_UDIV by constant into a multiply by magic constant.
-  bool matchUDivByConst(MachineInstr &MI) const;
-  void applyUDivByConst(MachineInstr &MI) const;
-  /// Combine G_UREM by constant into a multiply by magic constant.
-  bool matchURemByConst(MachineInstr &MI) const;
-  void applyURemByConst(MachineInstr &MI) const;
+  /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
+  bool matchUDivorURemByConst(MachineInstr &MI) const;
+  void applyUDivorURemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
   /// expression that implements it by multiplying by a magic number.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 52cbbf91849b6..84675b41c063e 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,8 +1132,8 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 def udiv_by_const : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UDIV):$root,
-   [{ return Helper.matchUDivByConst(*${root}); }]),
-  (apply [{ Helper.applyUDivByConst(*${root}); }])>;
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
@@ -1159,8 +1159,8 @@ def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
 def urem_by_const : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UREM):$root,
-   [{ return Helper.matchURemByConst(*${root}); }]),
-  (apply [{ Helper.applyURemByConst(*${root}); }])>;
+   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
 def intrem_combines : GICombineGroup<[urem_by_const]>;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 6ef23a68092db..94d1a347a8b0c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5289,8 +5289,8 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
 }
 
 MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
-  unsigned opcode = MI.getOpcode();
-  assert(opcode == TargetOpcode::G_UDIV || opcode == TargetOpcode::G_UREM);
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   auto &UDivorRem = cast<GenericMachineInstr>(MI);
   Register Dst = UDivorRem.getReg(0);
   Register LHS = UDivorRem.getReg(1);
@@ -5449,15 +5449,16 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
       Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
   auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q);
 
-  if (opcode == TargetOpcode::G_UREM) {
+  if (Opcode == TargetOpcode::G_UREM) {
     auto Prod = MIB.buildMul(Ty, ret, RHS);
     return MIB.buildSub(Ty, LHS, Prod);
   }
   return ret;
 }
 
-bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
@@ -5474,7 +5475,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
   if (MF.getFunction().hasMinSize())
     return false;
 
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+  if (Opcode == TargetOpcode::G_UDIV &&
+      MI.getFlag(MachineInstr::MIFlag::IsExact)) {
     return matchUnaryPredicate(
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
@@ -5494,51 +5496,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const {
              {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
               DstTy}}))
       return false;
-  }
-
-  return matchUnaryPredicate(
-      MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
-}
-
-void CombinerHelper::applyUDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivorURemUsingMul(MI);
-  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
-}
-
-bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_UREM);
-  Register Dst = MI.getOperand(0).getReg();
-  Register RHS = MI.getOperand(2).getReg();
-  LLT DstTy = MRI.getType(Dst);
-
-  auto &MF = *MI.getMF();
-  AttributeList Attr = MF.getFunction().getAttributes();
-  const auto &TLI = getTargetLowering();
-  LLVMContext &Ctx = MF.getFunction().getContext();
-  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, Ctx), Attr))
-    return false;
-
-  // Don't do this for minsize because the instruction sequence is usually
-  // larger.
-  if (MF.getFunction().hasMinSize())
-    return false;
-
-  auto *RHSDef = MRI.getVRegDef(RHS);
-  if (!isConstantOrConstantVector(*RHSDef, MRI))
-    return false;
-
-  // Don't do this if the types are not going to be legal.
-  if (LI) {
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_ICMP,
-             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
-              DstTy}}))
-      return false;
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+    if (Opcode == TargetOpcode::G_UREM &&
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
       return false;
   }
 
@@ -5546,7 +5505,7 @@ bool CombinerHelper::matchURemByConst(MachineInstr &MI) const {
       MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
-void CombinerHelper::applyURemByConst(MachineInstr &MI) const {
+void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
   auto *NewMI = buildUDivorURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }

>From bbf420e34ca7518f291ff56f10e2128ee3e29a7d Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Mon, 30 Jun 2025 08:54:29 +0000
Subject: [PATCH 4/5] [GISel] removed wip_match_opcode, explicitly disabled
 global-isel in RUN line

---
 llvm/include/llvm/Target/GlobalISel/Combine.td                  | 2 +-
 .../AArch64/GlobalISel/{combine_urem.ll => combine-urem.ll}     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename llvm/test/CodeGen/AArch64/GlobalISel/{combine_urem.ll => combine-urem.ll} (98%)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 84675b41c063e..6033d80e717d3 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1158,7 +1158,7 @@ def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const,
 
 def urem_by_const : GICombineRule<
   (defs root:$root),
-  (match (wip_match_opcode G_UREM):$root,
+  (match (G_UREM $dst, $x, $y):$root,
    [{ return Helper.matchUDivorURemByConst(*${root}); }]),
   (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
similarity index 98%
rename from llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
rename to llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
index 0cf827410c30c..254c16412a5da 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine_urem.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=false | FileCheck %s --check-prefixes=CHECK-SD
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=CHECK-GI
 
 

>From 1a1d1e6864b90c3d13acba34421a34b77bcd4228 Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Mon, 30 Jun 2025 13:34:02 +0000
Subject: [PATCH 5/5] [GISel] Added unit tests in AArch64 for UREM/SREM

---
 .../AArch64/GlobalISel/combine-urem.ll        |  243 -
 llvm/test/CodeGen/AArch64/rem-by-const.ll     | 7966 +++++++++++++++++
 2 files changed, 7966 insertions(+), 243 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
 create mode 100644 llvm/test/CodeGen/AArch64/rem-by-const.ll

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
deleted file mode 100644
index 254c16412a5da..0000000000000
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-urem.ll
+++ /dev/null
@@ -1,243 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel=false | FileCheck %s --check-prefixes=CHECK-SD
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=CHECK-GI
-
-
-define i8 @test7s8(i8 %a) {
-; CHECK-SD-LABEL: test7s8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #37 // =0x25
-; CHECK-SD-NEXT:    and w9, w0, #0xff
-; CHECK-SD-NEXT:    mul w8, w9, w8
-; CHECK-SD-NEXT:    lsr w8, w8, #8
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    and w9, w9, #0xfe
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
-; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
-; CHECK-SD-NEXT:    add w0, w0, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test7s8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #37 // =0x25
-; CHECK-GI-NEXT:    and w9, w0, #0xff
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    lsr w8, w8, #8
-; CHECK-GI-NEXT:    sub w9, w0, w8
-; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
-; CHECK-GI-NEXT:    add w8, w9, w8
-; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
-; CHECK-GI-NEXT:    lsl w9, w8, #3
-; CHECK-GI-NEXT:    sub w8, w9, w8
-; CHECK-GI-NEXT:    sub w0, w0, w8
-; CHECK-GI-NEXT:    ret
-  %r = urem i8 %a, 7
-  ret i8 %r
-}
-
-define i8 @test100s8(i8 %a) {
-; CHECK-SD-LABEL: test100s8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #41 // =0x29
-; CHECK-SD-NEXT:    and w9, w0, #0xff
-; CHECK-SD-NEXT:    mul w8, w9, w8
-; CHECK-SD-NEXT:    mov w9, #100 // =0x64
-; CHECK-SD-NEXT:    lsr w8, w8, #12
-; CHECK-SD-NEXT:    msub w0, w8, w9, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test100s8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #41 // =0x29
-; CHECK-GI-NEXT:    and w9, w0, #0xff
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    mov w9, #100 // =0x64
-; CHECK-GI-NEXT:    lsr w8, w8, #8
-; CHECK-GI-NEXT:    lsr w8, w8, #4
-; CHECK-GI-NEXT:    msub w0, w8, w9, w0
-; CHECK-GI-NEXT:    ret
-  %r = urem i8 %a, 100
-  ret i8 %r
-}
-
-define i32 @test7s32(i32 %a) {
-; CHECK-SD-LABEL: test7s32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
-; CHECK-SD-NEXT:    umull x8, w0, w8
-; CHECK-SD-NEXT:    lsr x8, x8, #32
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
-; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
-; CHECK-SD-NEXT:    add w0, w0, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test7s32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
-; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
-; CHECK-GI-NEXT:    umull x8, w0, w8
-; CHECK-GI-NEXT:    lsr x8, x8, #32
-; CHECK-GI-NEXT:    sub w9, w0, w8
-; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-GI-NEXT:    lsr w8, w8, #2
-; CHECK-GI-NEXT:    lsl w9, w8, #3
-; CHECK-GI-NEXT:    sub w8, w9, w8
-; CHECK-GI-NEXT:    sub w0, w0, w8
-; CHECK-GI-NEXT:    ret
- %c = urem i32 %a, 7
- ret i32 %c
-}
-
-define i32 @test100s32(i32 %a) {
-; CHECK-SD-LABEL: test100s32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT:    mov w9, #100 // =0x64
-; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
-; CHECK-SD-NEXT:    umull x8, w0, w8
-; CHECK-SD-NEXT:    lsr x8, x8, #37
-; CHECK-SD-NEXT:    msub w0, w8, w9, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test100s32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
-; CHECK-GI-NEXT:    mov w9, #100 // =0x64
-; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
-; CHECK-GI-NEXT:    umull x8, w0, w8
-; CHECK-GI-NEXT:    lsr x8, x8, #32
-; CHECK-GI-NEXT:    lsr w8, w8, #5
-; CHECK-GI-NEXT:    msub w0, w8, w9, w0
-; CHECK-GI-NEXT:    ret
- %c = urem i32 %a, 100
- ret i32 %c
-}
-
-define <8 x i16> @test7v8s16(<8 x i16> %a) {
-; CHECK-SD-LABEL: test7v8s16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
-; CHECK-SD-NEXT:    dup v1.8h, w8
-; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
-; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
-; CHECK-SD-NEXT:    movi v2.8h, #7
-; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
-; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test7v8s16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
-; CHECK-GI-NEXT:    movi v2.8h, #7
-; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
-; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT:    ret
-  %r = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-  ret <8 x i16> %r
-}
-
-define <8 x i16> @test100v8s16(<8 x i16> %a) {
-; CHECK-SD-LABEL: test100v8s16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
-; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
-; CHECK-SD-NEXT:    dup v1.8h, w8
-; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
-; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
-; CHECK-SD-NEXT:    movi v2.8h, #100
-; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
-; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
-; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test100v8s16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI5_0
-; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
-; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
-; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    movi v2.8h, #100
-; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
-; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT:    ret
-  %r = urem <8 x i16> %a, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
-  ret <8 x i16> %r
-}
-
-define <4 x i32> @test7v4s32(<4 x i32> %a) {
-; CHECK-SD-LABEL: test7v4s32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
-; CHECK-SD-NEXT:    dup v1.4s, w8
-; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
-; CHECK-SD-NEXT:    movi v2.4s, #7
-; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
-; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test7v4s32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI6_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
-; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
-; CHECK-GI-NEXT:    movi v2.4s, #7
-; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
-; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    ret
-  %r = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
-  ret <4 x i32> %r
-}
-
-define <4 x i32> @test100v4s32(<4 x i32> %a) {
-; CHECK-SD-LABEL: test100v4s32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
-; CHECK-SD-NEXT:    dup v1.4s, w8
-; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT:    movi v2.4s, #100
-; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
-; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test100v4s32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    movi v2.4s, #100
-; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
-; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    ret
-  %r = urem <4 x i32> %a, <i32 100, i32 100, i32 100, i32 100>
-  ret <4 x i32> %r
-}
-
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
new file mode 100644
index 0000000000000..0db62302930df
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -0,0 +1,7966 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i8 @si8_7(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: si8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #-109 // =0xffffff93
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    add w8, w0, w8, lsr #8
+; CHECK-SD-NEXT:    sbfx w9, w8, #2, #6
+; CHECK-SD-NEXT:    and w8, w8, #0x80
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #7
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 7
+  ret i8 %s
+}
+
+define i8 @si8_100(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: si8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #41 // =0x29
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #12
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 100
+  ret i8 %s
+}
+
+define i8 @ui8_7(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: ui8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 7
+  ret i8 %s
+}
+
+define i8 @ui8_100(i8 %a, i8 %b) {
+; CHECK-SD-LABEL: ui8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #41 // =0x29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #12
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    lsr w8, w8, #4
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 100
+  ret i8 %s
+}
+
+define i16 @si16_7(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: si16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #17
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 7
+  ret i16 %s
+}
+
+define i16 @si16_100(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: si16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #19
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 100
+  ret i16 %s
+}
+
+define i16 @ui16_7(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: ui16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    and w9, w0, #0xffff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #16
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfffe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    and w9, w0, #0xffff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #14
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 7
+  ret i16 %s
+}
+
+define i16 @ui16_100(i16 %a, i16 %b) {
+; CHECK-SD-LABEL: ui16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #17
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-GI-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    lsr w8, w8, #1
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 100
+  ret i16 %s
+}
+
+define i32 @si32_7(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: si32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w9, w8, #2
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w0, w8
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 7
+  ret i32 %s
+}
+
+define i32 @si32_100(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: si32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w9, w0, w8
+; CHECK-GI-NEXT:    msub w0, w9, w8, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 100
+  ret i32 %s
+}
+
+define i32 @ui32_7(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: ui32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    lsr w8, w8, #2
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 7
+  ret i32 %s
+}
+
+define i32 @ui32_100(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: ui32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    lsr w8, w8, #5
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 100
+  ret i32 %s
+}
+
+define i64 @si64_7(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: si64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    asr x9, x8, #1
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv x8, x0, x8
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 7
+  ret i64 %s
+}
+
+define i64 @si64_100(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: si64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x0
+; CHECK-SD-NEXT:    asr x9, x8, #6
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub x0, x8, x9, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv x9, x0, x8
+; CHECK-GI-NEXT:    msub x0, x9, x8, x0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 100
+  ret i64 %s
+}
+
+define i64 @ui64_7(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: ui64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x0, x8
+; CHECK-SD-NEXT:    sub x9, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x8, x0, x8
+; CHECK-GI-NEXT:    sub x9, x0, x8
+; CHECK-GI-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i64 %a, 7
+  ret i64 %s
+}
+
+define i64 @ui64_100(i64 %a, i64 %b) {
+; CHECK-LABEL: ui64_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x9, #62915 // =0xf5c3
+; CHECK-NEXT:    lsr x8, x0, #2
+; CHECK-NEXT:    movk x9, #23592, lsl #16
+; CHECK-NEXT:    movk x9, #49807, lsl #32
+; CHECK-NEXT:    movk x9, #10485, lsl #48
+; CHECK-NEXT:    umulh x8, x8, x9
+; CHECK-NEXT:    mov w9, #100 // =0x64
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    msub x0, x8, x9, x0
+; CHECK-NEXT:    ret
+entry:
+  %s = urem i64 %a, 100
+  ret i64 %s
+}
+
+define i128 @si128_7(i128 %a, i128 %b) {
+; CHECK-LABEL: si128_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #7 // =0x7
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 7
+  ret i128 %s
+}
+
+define i128 @si128_100(i128 %a, i128 %b) {
+; CHECK-LABEL: si128_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #100 // =0x64
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 100
+  ret i128 %s
+}
+
+define i128 @ui128_7(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: ui128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x10, #9362 // =0x2492
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    subs x10, x0, x9
+; CHECK-GI-NEXT:    sbc x11, x1, x8
+; CHECK-GI-NEXT:    lsl x12, x11, #63
+; CHECK-GI-NEXT:    lsr x11, x11, #1
+; CHECK-GI-NEXT:    orr x10, x12, x10, lsr #1
+; CHECK-GI-NEXT:    adds x9, x10, x9
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsl x10, x8, #62
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #2
+; CHECK-GI-NEXT:    mov w10, #7 // =0x7
+; CHECK-GI-NEXT:    lsl x12, x8, #3
+; CHECK-GI-NEXT:    umulh x10, x9, x10
+; CHECK-GI-NEXT:    lsl x11, x9, #3
+; CHECK-GI-NEXT:    sub x8, x12, x8
+; CHECK-GI-NEXT:    sub x9, x11, x9
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 7
+  ret i128 %s
+}
+
+define i128 @ui128_100(i128 %a, i128 %b) {
+; CHECK-SD-LABEL: ui128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x10, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x10, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x8, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    lsl x10, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #4
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    umulh x11, x9, x10
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    madd x8, x8, x10, x11
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 100
+  ret i128 %s
+}
+
+define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: sv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #24
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #24
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 7, i8 7>
+  ret <2 x i8> %s
+}
+
+define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: sv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 100, i8 100>
+  ret <2 x i8> %s
+}
+
+define <3 x i8> @sv3i8_7(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: sv3i8_7:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov x9, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    movk x9, #37449, lsl #16
+; CHECK-SD-NEXT:    sxtb w12, w1
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    sxtb w13, w0
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    sxtb w11, w2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    add w8, w8, w13
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    asr w16, w9, #2
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w16, w9, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w0, w13, w8
+; CHECK-SD-NEXT:    add w1, w12, w10
+; CHECK-SD-NEXT:    add w2, w11, w9
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_7:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w0, w8, w10
+; CHECK-GI-NEXT:    sdiv w9, w13, w9
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w12, w15, w12
+; CHECK-GI-NEXT:    sub w1, w11, w12
+; CHECK-GI-NEXT:    lsl w16, w9, #3
+; CHECK-GI-NEXT:    sub w9, w16, w9
+; CHECK-GI-NEXT:    sub w2, w13, w9
+; CHECK-GI-NEXT:    ret
+  %s = srem <3 x i8> %d, <i8 7, i8 7, i8 7>
+  ret <3 x i8> %s
+}
+
+define <3 x i8> @sv3i8_100(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: sv3i8_100:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov w9, #34079 // =0x851f
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    movk w9, #20971, lsl #16
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    sxtb w12, w0
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    mov w11, #100 // =0x64
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w11, w12
+; CHECK-SD-NEXT:    sxtb w8, w1
+; CHECK-SD-NEXT:    msub w1, w10, w11, w8
+; CHECK-SD-NEXT:    sxtb w8, w2
+; CHECK-SD-NEXT:    msub w2, w9, w11, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_100:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    msub w0, w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w14, w13, w9
+; CHECK-GI-NEXT:    msub w1, w12, w9, w11
+; CHECK-GI-NEXT:    msub w2, w14, w9, w13
+; CHECK-GI-NEXT:    ret
+  %s = srem <3 x i8> %d, <i8 100, i8 100, i8 100>
+  ret <3 x i8> %s
+}
+
+define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: sv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    smov x12, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smov x14, v0.h[3]
+; CHECK-SD-NEXT:    smov w16, v0.h[2]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    smull x8, w14, w8
+; CHECK-SD-NEXT:    smov w14, v0.h[3]
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    asr w15, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w17, w10, #2
+; CHECK-SD-NEXT:    add w12, w12, w16
+; CHECK-SD-NEXT:    add w9, w15, w9, lsr #31
+; CHECK-SD-NEXT:    asr w15, w12, #2
+; CHECK-SD-NEXT:    add w8, w8, w14
+; CHECK-SD-NEXT:    add w10, w17, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w11, w9
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w9, w15, w12, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w9, w16, w9
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v3.4h, #7
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
+  ret <4 x i8> %s
+}
+
+define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: sv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    sshr v1.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v1.h[0]
+; CHECK-SD-NEXT:    smov x10, v1.h[1]
+; CHECK-SD-NEXT:    smov x11, v1.h[2]
+; CHECK-SD-NEXT:    smov w12, v1.h[0]
+; CHECK-SD-NEXT:    smov x13, v1.h[3]
+; CHECK-SD-NEXT:    smov w15, v1.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    smull x8, w13, w8
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    msub w10, w10, w14, w15
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w9, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w11, v1.h[2]
+; CHECK-SD-NEXT:    msub w9, w9, w14, w11
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    smov w10, v1.h[3]
+; CHECK-SD-NEXT:    msub w8, w8, w14, w10
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v3.4h, #100
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
+  ret <4 x i8> %s
+}
+
+define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #147
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    add v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #41
+; CHECK-SD-NEXT:    movi v2.8b, #100
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #4
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #147
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #7
+; CHECK-SD-NEXT:    add v1.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #2
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v16.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #41
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #100
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #4
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v16.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+define <32 x i8> @sv32i8_7(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: sv32i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -48
+; CHECK-SD-NEXT:    smov x10, v0.b[0]
+; CHECK-SD-NEXT:    smov x9, v0.b[1]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov w17, v0.b[0]
+; CHECK-SD-NEXT:    smov w15, v0.b[1]
+; CHECK-SD-NEXT:    smov x11, v0.b[2]
+; CHECK-SD-NEXT:    smov x13, v0.b[3]
+; CHECK-SD-NEXT:    smov x18, v0.b[4]
+; CHECK-SD-NEXT:    smov w14, v0.b[2]
+; CHECK-SD-NEXT:    smov x1, v0.b[5]
+; CHECK-SD-NEXT:    smov w16, v0.b[3]
+; CHECK-SD-NEXT:    smull x2, w10, w8
+; CHECK-SD-NEXT:    smov x3, v0.b[6]
+; CHECK-SD-NEXT:    smov w12, v0.b[4]
+; CHECK-SD-NEXT:    smull x0, w9, w8
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    smov x5, v0.b[7]
+; CHECK-SD-NEXT:    smull x4, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.b[5]
+; CHECK-SD-NEXT:    smov w10, v0.b[6]
+; CHECK-SD-NEXT:    lsr x2, x2, #32
+; CHECK-SD-NEXT:    smull x13, w13, w8
+; CHECK-SD-NEXT:    smov w9, v0.b[7]
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov x6, v0.b[8]
+; CHECK-SD-NEXT:    add w2, w2, w17
+; CHECK-SD-NEXT:    lsr x4, x4, #32
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w0, w0, w15
+; CHECK-SD-NEXT:    asr w19, w2, #2
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    asr w7, w0, #2
+; CHECK-SD-NEXT:    add w4, w4, w14
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w2, w19, w2, lsr #31
+; CHECK-SD-NEXT:    smull x3, w3, w8
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    add w0, w7, w0, lsr #31
+; CHECK-SD-NEXT:    asr w7, w4, #2
+; CHECK-SD-NEXT:    add w13, w13, w16
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w18, w18, w12
+; CHECK-SD-NEXT:    asr w19, w13, #2
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w4, w7, w4, lsr #31
+; CHECK-SD-NEXT:    asr w20, w18, #2
+; CHECK-SD-NEXT:    add w17, w17, w2
+; CHECK-SD-NEXT:    smull x5, w5, w8
+; CHECK-SD-NEXT:    add w1, w1, w11
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    smov x0, v0.b[9]
+; CHECK-SD-NEXT:    fmov s0, w17
+; CHECK-SD-NEXT:    sub w4, w4, w4, lsl #3
+; CHECK-SD-NEXT:    add w7, w19, w13, lsr #31
+; CHECK-SD-NEXT:    add w13, w20, w18, lsr #31
+; CHECK-SD-NEXT:    lsr x18, x3, #32
+; CHECK-SD-NEXT:    asr w2, w1, #2
+; CHECK-SD-NEXT:    smull x3, w6, w8
+; CHECK-SD-NEXT:    mov v0.b[1], w15
+; CHECK-SD-NEXT:    smov x6, v2.b[10]
+; CHECK-SD-NEXT:    add w14, w14, w4
+; CHECK-SD-NEXT:    lsr x17, x5, #32
+; CHECK-SD-NEXT:    add w5, w18, w10
+; CHECK-SD-NEXT:    add w15, w2, w1, lsr #31
+; CHECK-SD-NEXT:    smull x1, w0, w8
+; CHECK-SD-NEXT:    sub w7, w7, w7, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    smov w18, v2.b[8]
+; CHECK-SD-NEXT:    smov w0, v2.b[9]
+; CHECK-SD-NEXT:    add w17, w17, w9
+; CHECK-SD-NEXT:    mov v0.b[2], w14
+; CHECK-SD-NEXT:    asr w14, w5, #2
+; CHECK-SD-NEXT:    add w12, w12, w13
+; CHECK-SD-NEXT:    lsr x19, x1, #32
+; CHECK-SD-NEXT:    lsr x3, x3, #32
+; CHECK-SD-NEXT:    smov x13, v1.b[0]
+; CHECK-SD-NEXT:    add w1, w14, w5, lsr #31
+; CHECK-SD-NEXT:    smull x5, w6, w8
+; CHECK-SD-NEXT:    add w6, w16, w7
+; CHECK-SD-NEXT:    smov x7, v1.b[1]
+; CHECK-SD-NEXT:    smov w14, v2.b[10]
+; CHECK-SD-NEXT:    asr w2, w17, #2
+; CHECK-SD-NEXT:    mov v0.b[3], w6
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w3, w3, w18
+; CHECK-SD-NEXT:    lsr x5, x5, #32
+; CHECK-SD-NEXT:    asr w4, w3, #2
+; CHECK-SD-NEXT:    add w16, w19, w0
+; CHECK-SD-NEXT:    smov x19, v2.b[11]
+; CHECK-SD-NEXT:    add w20, w2, w17, lsr #31
+; CHECK-SD-NEXT:    add w11, w11, w15
+; CHECK-SD-NEXT:    add w2, w5, w14
+; CHECK-SD-NEXT:    smov w5, v1.b[1]
+; CHECK-SD-NEXT:    smull x15, w13, w8
+; CHECK-SD-NEXT:    mov v0.b[4], w12
+; CHECK-SD-NEXT:    smull x12, w7, w8
+; CHECK-SD-NEXT:    add w21, w4, w3, lsr #31
+; CHECK-SD-NEXT:    sub w7, w1, w1, lsl #3
+; CHECK-SD-NEXT:    smov x22, v1.b[2]
+; CHECK-SD-NEXT:    asr w6, w16, #2
+; CHECK-SD-NEXT:    smull x4, w19, w8
+; CHECK-SD-NEXT:    sub w19, w20, w20, lsl #3
+; CHECK-SD-NEXT:    sub w20, w21, w21, lsl #3
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    lsr x21, x15, #32
+; CHECK-SD-NEXT:    add w15, w10, w7
+; CHECK-SD-NEXT:    mov v0.b[5], w11
+; CHECK-SD-NEXT:    smov w11, v1.b[0]
+; CHECK-SD-NEXT:    add w6, w6, w16, lsr #31
+; CHECK-SD-NEXT:    add w12, w12, w5
+; CHECK-SD-NEXT:    add w9, w9, w19
+; CHECK-SD-NEXT:    smull x19, w22, w8
+; CHECK-SD-NEXT:    asr w7, w12, #2
+; CHECK-SD-NEXT:    smov x22, v1.b[3]
+; CHECK-SD-NEXT:    sub w23, w6, w6, lsl #3
+; CHECK-SD-NEXT:    add w20, w18, w20
+; CHECK-SD-NEXT:    smov w6, v1.b[2]
+; CHECK-SD-NEXT:    smov w17, v2.b[11]
+; CHECK-SD-NEXT:    mov v0.b[6], w15
+; CHECK-SD-NEXT:    add w21, w21, w11
+; CHECK-SD-NEXT:    add w7, w7, w12, lsr #31
+; CHECK-SD-NEXT:    asr w12, w21, #2
+; CHECK-SD-NEXT:    lsr x19, x19, #32
+; CHECK-SD-NEXT:    smov x1, v2.b[12]
+; CHECK-SD-NEXT:    sub w7, w7, w7, lsl #3
+; CHECK-SD-NEXT:    smov w13, v2.b[12]
+; CHECK-SD-NEXT:    smov x16, v2.b[13]
+; CHECK-SD-NEXT:    add w18, w12, w21, lsr #31
+; CHECK-SD-NEXT:    smov w10, v2.b[13]
+; CHECK-SD-NEXT:    smov x15, v2.b[14]
+; CHECK-SD-NEXT:    mov v0.b[7], w9
+; CHECK-SD-NEXT:    add w5, w5, w7
+; CHECK-SD-NEXT:    add w7, w19, w6
+; CHECK-SD-NEXT:    sub w21, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w18, w0, w23
+; CHECK-SD-NEXT:    smull x0, w22, w8
+; CHECK-SD-NEXT:    smov x22, v1.b[4]
+; CHECK-SD-NEXT:    smov w19, v1.b[3]
+; CHECK-SD-NEXT:    smov w9, v2.b[14]
+; CHECK-SD-NEXT:    smov x12, v2.b[15]
+; CHECK-SD-NEXT:    asr w3, w2, #2
+; CHECK-SD-NEXT:    lsr x4, x4, #32
+; CHECK-SD-NEXT:    mov v0.b[8], w20
+; CHECK-SD-NEXT:    add w20, w11, w21
+; CHECK-SD-NEXT:    asr w21, w7, #2
+; CHECK-SD-NEXT:    smov w11, v2.b[15]
+; CHECK-SD-NEXT:    fmov s2, w20
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    add w7, w21, w7, lsr #31
+; CHECK-SD-NEXT:    smull x20, w22, w8
+; CHECK-SD-NEXT:    smov x21, v1.b[5]
+; CHECK-SD-NEXT:    add w0, w0, w19
+; CHECK-SD-NEXT:    add w2, w3, w2, lsr #31
+; CHECK-SD-NEXT:    smov x22, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[1], w5
+; CHECK-SD-NEXT:    smov w5, v1.b[4]
+; CHECK-SD-NEXT:    sub w3, w7, w7, lsl #3
+; CHECK-SD-NEXT:    asr w7, w0, #2
+; CHECK-SD-NEXT:    lsr x20, x20, #32
+; CHECK-SD-NEXT:    add w4, w4, w17
+; CHECK-SD-NEXT:    add w3, w6, w3
+; CHECK-SD-NEXT:    smull x6, w21, w8
+; CHECK-SD-NEXT:    asr w21, w4, #2
+; CHECK-SD-NEXT:    add w0, w7, w0, lsr #31
+; CHECK-SD-NEXT:    smov w7, v1.b[5]
+; CHECK-SD-NEXT:    smull x22, w22, w8
+; CHECK-SD-NEXT:    mov v2.b[2], w3
+; CHECK-SD-NEXT:    add w3, w20, w5
+; CHECK-SD-NEXT:    add w4, w21, w4, lsr #31
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w20, w3, #2
+; CHECK-SD-NEXT:    lsr x6, x6, #32
+; CHECK-SD-NEXT:    lsr x22, x22, #32
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    smov x21, v1.b[8]
+; CHECK-SD-NEXT:    add w19, w19, w0
+; CHECK-SD-NEXT:    add w3, w20, w3, lsr #31
+; CHECK-SD-NEXT:    smov w0, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[3], w19
+; CHECK-SD-NEXT:    smov x19, v1.b[7]
+; CHECK-SD-NEXT:    add w6, w6, w7
+; CHECK-SD-NEXT:    sub w3, w3, w3, lsl #3
+; CHECK-SD-NEXT:    asr w20, w6, #2
+; CHECK-SD-NEXT:    mov v0.b[9], w18
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w14, w14, w2
+; CHECK-SD-NEXT:    sub w2, w4, w4, lsl #3
+; CHECK-SD-NEXT:    add w3, w5, w3
+; CHECK-SD-NEXT:    add w5, w20, w6, lsr #31
+; CHECK-SD-NEXT:    add w6, w22, w0
+; CHECK-SD-NEXT:    smull x19, w19, w8
+; CHECK-SD-NEXT:    mov v2.b[4], w3
+; CHECK-SD-NEXT:    smov w20, v1.b[7]
+; CHECK-SD-NEXT:    asr w3, w6, #2
+; CHECK-SD-NEXT:    sub w5, w5, w5, lsl #3
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    mov v0.b[10], w14
+; CHECK-SD-NEXT:    smov w14, v1.b[8]
+; CHECK-SD-NEXT:    add w17, w17, w2
+; CHECK-SD-NEXT:    lsr x4, x19, #32
+; CHECK-SD-NEXT:    add w18, w3, w6, lsr #31
+; CHECK-SD-NEXT:    add w3, w7, w5
+; CHECK-SD-NEXT:    mov v2.b[5], w3
+; CHECK-SD-NEXT:    smov x5, v1.b[9]
+; CHECK-SD-NEXT:    add w1, w1, w13
+; CHECK-SD-NEXT:    add w3, w4, w20
+; CHECK-SD-NEXT:    smull x4, w21, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    asr w6, w3, #2
+; CHECK-SD-NEXT:    mov v0.b[11], w17
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    add w18, w0, w18
+; CHECK-SD-NEXT:    asr w0, w1, #2
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    add w2, w6, w3, lsr #31
+; CHECK-SD-NEXT:    lsr x3, x4, #32
+; CHECK-SD-NEXT:    mov v2.b[6], w18
+; CHECK-SD-NEXT:    smull x18, w5, w8
+; CHECK-SD-NEXT:    smov x4, v1.b[10]
+; CHECK-SD-NEXT:    smov w5, v1.b[9]
+; CHECK-SD-NEXT:    add w3, w3, w14
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w17, w0, w1, lsr #31
+; CHECK-SD-NEXT:    asr w0, w3, #2
+; CHECK-SD-NEXT:    smov x6, v1.b[12]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w1, w20, w2
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w3, lsr #31
+; CHECK-SD-NEXT:    smull x2, w4, w8
+; CHECK-SD-NEXT:    smov x3, v1.b[11]
+; CHECK-SD-NEXT:    mov v2.b[7], w1
+; CHECK-SD-NEXT:    add w18, w18, w5
+; CHECK-SD-NEXT:    smov w1, v1.b[10]
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w4, w18, #2
+; CHECK-SD-NEXT:    smull x6, w6, w8
+; CHECK-SD-NEXT:    lsr x2, x2, #32
+; CHECK-SD-NEXT:    add w16, w16, w10
+; CHECK-SD-NEXT:    add w13, w13, w17
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    add w18, w4, w18, lsr #31
+; CHECK-SD-NEXT:    smull x0, w3, w8
+; CHECK-SD-NEXT:    mov v2.b[8], w14
+; CHECK-SD-NEXT:    add w14, w2, w1
+; CHECK-SD-NEXT:    smov w2, v1.b[11]
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    asr w3, w14, #2
+; CHECK-SD-NEXT:    asr w4, w16, #2
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    lsr x6, x6, #32
+; CHECK-SD-NEXT:    mov v0.b[12], w13
+; CHECK-SD-NEXT:    add w18, w5, w18
+; CHECK-SD-NEXT:    add w14, w3, w14, lsr #31
+; CHECK-SD-NEXT:    smov w3, v1.b[12]
+; CHECK-SD-NEXT:    mov v2.b[9], w18
+; CHECK-SD-NEXT:    add w18, w0, w2
+; CHECK-SD-NEXT:    smov x0, v1.b[13]
+; CHECK-SD-NEXT:    asr w5, w18, #2
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    add w16, w4, w16, lsr #31
+; CHECK-SD-NEXT:    smov x4, v1.b[14]
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    add w17, w5, w18, lsr #31
+; CHECK-SD-NEXT:    add w14, w1, w14
+; CHECK-SD-NEXT:    add w18, w6, w3
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    asr w1, w18, #2
+; CHECK-SD-NEXT:    mov v2.b[10], w14
+; CHECK-SD-NEXT:    sub w14, w17, w17, lsl #3
+; CHECK-SD-NEXT:    smov w17, v1.b[13]
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    add w13, w1, w18, lsr #31
+; CHECK-SD-NEXT:    smov x1, v1.b[15]
+; CHECK-SD-NEXT:    add w15, w15, w9
+; CHECK-SD-NEXT:    lsr x18, x0, #32
+; CHECK-SD-NEXT:    add w14, w2, w14
+; CHECK-SD-NEXT:    smull x0, w4, w8
+; CHECK-SD-NEXT:    mov v2.b[11], w14
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    add w14, w18, w17
+; CHECK-SD-NEXT:    smov w18, v1.b[14]
+; CHECK-SD-NEXT:    mov v0.b[13], w10
+; CHECK-SD-NEXT:    asr w16, w14, #2
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    add w13, w3, w13
+; CHECK-SD-NEXT:    smull x8, w1, w8
+; CHECK-SD-NEXT:    lsr x10, x12, #32
+; CHECK-SD-NEXT:    add w14, w16, w14, lsr #31
+; CHECK-SD-NEXT:    asr w16, w15, #2
+; CHECK-SD-NEXT:    mov v2.b[12], w13
+; CHECK-SD-NEXT:    add w13, w0, w18
+; CHECK-SD-NEXT:    smov w0, v1.b[15]
+; CHECK-SD-NEXT:    add w10, w10, w11
+; CHECK-SD-NEXT:    add w15, w16, w15, lsr #31
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    asr w16, w13, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add w12, w17, w14
+; CHECK-SD-NEXT:    add w13, w16, w13, lsr #31
+; CHECK-SD-NEXT:    sub w14, w15, w15, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[13], w12
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w12, w10, #2
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    asr w15, w8, #2
+; CHECK-SD-NEXT:    add w9, w9, w14
+; CHECK-SD-NEXT:    add w10, w12, w10, lsr #31
+; CHECK-SD-NEXT:    mov v0.b[14], w9
+; CHECK-SD-NEXT:    add w12, w18, w13
+; CHECK-SD-NEXT:    add w8, w15, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[14], w12
+; CHECK-SD-NEXT:    sub w9, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add w9, w11, w9
+; CHECK-SD-NEXT:    add w8, w0, w8
+; CHECK-SD-NEXT:    mov v0.b[15], w9
+; CHECK-SD-NEXT:    mov v2.b[15], w8
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv32i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sshll v4.8h, v1.8b, #0
+; CHECK-GI-NEXT:    movi v20.8b, #7
+; CHECK-GI-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v20.8h, v20.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w4, s5
+; CHECK-GI-NEXT:    fmov w19, s4
+; CHECK-GI-NEXT:    sshll v23.4s, v20.4h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v20.8h, #0
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[1]
+; CHECK-GI-NEXT:    sdiv w17, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[1]
+; CHECK-GI-NEXT:    fmov s6, w12
+; CHECK-GI-NEXT:    sdiv w13, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[2]
+; CHECK-GI-NEXT:    fmov s7, w17
+; CHECK-GI-NEXT:    sdiv w18, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[2]
+; CHECK-GI-NEXT:    mov v6.s[1], w13
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov v7.s[1], w18
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v24.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sdiv w16, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v6.s[2], w11
+; CHECK-GI-NEXT:    fmov w14, s2
+; CHECK-GI-NEXT:    sdiv w23, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[1]
+; CHECK-GI-NEXT:    mov v7.s[2], w16
+; CHECK-GI-NEXT:    sdiv w20, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[1]
+; CHECK-GI-NEXT:    fmov s19, w23
+; CHECK-GI-NEXT:    sdiv w24, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[2]
+; CHECK-GI-NEXT:    fmov s18, w20
+; CHECK-GI-NEXT:    sdiv w2, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov v19.s[1], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w21, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[2]
+; CHECK-GI-NEXT:    fmov s16, w2
+; CHECK-GI-NEXT:    sdiv w22, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[3]
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    mov v18.s[1], w21
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v25.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sdiv w1, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[2]
+; CHECK-GI-NEXT:    mov v19.s[2], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w7, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov v16.s[1], w1
+; CHECK-GI-NEXT:    fmov w25, s5
+; CHECK-GI-NEXT:    mov w26, v5.s[1]
+; CHECK-GI-NEXT:    mov w27, v5.s[2]
+; CHECK-GI-NEXT:    mov w28, v5.s[3]
+; CHECK-GI-NEXT:    sshll2 v5.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    fmov w29, s5
+; CHECK-GI-NEXT:    mov w30, v5.s[1]
+; CHECK-GI-NEXT:    mov w11, v5.s[3]
+; CHECK-GI-NEXT:    sdiv w15, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v3.8h, #0
+; CHECK-GI-NEXT:    mov v18.s[2], w7
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    fmov w0, s2
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    sdiv w6, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[1]
+; CHECK-GI-NEXT:    str w9, [sp, #12] // 4-byte Folded Spill
+; CHECK-GI-NEXT:    mov w9, v5.s[2]
+; CHECK-GI-NEXT:    sdiv w25, w25, w8
+; CHECK-GI-NEXT:    fmov s17, w6
+; CHECK-GI-NEXT:    sdiv w29, w29, w8
+; CHECK-GI-NEXT:    fmov s21, w25
+; CHECK-GI-NEXT:    sdiv w5, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[2]
+; CHECK-GI-NEXT:    fmov s22, w29
+; CHECK-GI-NEXT:    sdiv w26, w26, w8
+; CHECK-GI-NEXT:    mov v17.s[1], w5
+; CHECK-GI-NEXT:    sdiv w30, w30, w8
+; CHECK-GI-NEXT:    mov v21.s[1], w26
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w3, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[3]
+; CHECK-GI-NEXT:    mov v22.s[1], w30
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w27, w27, w8
+; CHECK-GI-NEXT:    mov v17.s[2], w3
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v21.s[2], w27
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    mov v22.s[2], w9
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v7.s[3], w10
+; CHECK-GI-NEXT:    mls v0.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v16.s[3], w14
+; CHECK-GI-NEXT:    mls v3.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v17.s[3], w0
+; CHECK-GI-NEXT:    mls v2.4s, v17.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w19, w19, w8
+; CHECK-GI-NEXT:    mov v18.s[3], w4
+; CHECK-GI-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v25.4s, v18.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w28, w28, w8
+; CHECK-GI-NEXT:    mov v19.s[3], w19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mls v1.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w8, w11, w8
+; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; CHECK-GI-NEXT:    mov v21.s[3], w28
+; CHECK-GI-NEXT:    uzp1 v1.8h, v25.8h, v1.8h
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v6.s[3], w11
+; CHECK-GI-NEXT:    mls v4.4s, v21.4s, v23.4s
+; CHECK-GI-NEXT:    mls v24.4s, v6.4s, v23.4s
+; CHECK-GI-NEXT:    mov v22.s[3], w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v24.8h, v0.8h
+; CHECK-GI-NEXT:    mls v5.4s, v22.4s, v20.4s
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <32 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <32 x i8> %s
+}
+
+define <32 x i8> @sv32i8_100(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: sv32i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.b[1]
+; CHECK-SD-NEXT:    smov x11, v0.b[0]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x12, v0.b[2]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x14, v0.b[3]
+; CHECK-SD-NEXT:    smov x17, v0.b[5]
+; CHECK-SD-NEXT:    smov x15, v0.b[4]
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    smov w2, v0.b[1]
+; CHECK-SD-NEXT:    smov w13, v0.b[2]
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x16, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.b[0]
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    asr x0, x10, #37
+; CHECK-SD-NEXT:    smull x18, w14, w8
+; CHECK-SD-NEXT:    smov w14, v0.b[3]
+; CHECK-SD-NEXT:    asr x1, x16, #37
+; CHECK-SD-NEXT:    smull x16, w17, w8
+; CHECK-SD-NEXT:    smov w10, v0.b[4]
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    add w3, w0, w0, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    add w17, w1, w1, lsr #31
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    smov x0, v0.b[6]
+; CHECK-SD-NEXT:    add w1, w12, w12, lsr #31
+; CHECK-SD-NEXT:    msub w12, w3, w9, w2
+; CHECK-SD-NEXT:    smov x2, v1.b[0]
+; CHECK-SD-NEXT:    msub w11, w17, w9, w11
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    smov x3, v1.b[1]
+; CHECK-SD-NEXT:    asr x15, x15, #37
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    msub w1, w1, w9, w13
+; CHECK-SD-NEXT:    msub w13, w18, w9, w14
+; CHECK-SD-NEXT:    smov w17, v0.b[5]
+; CHECK-SD-NEXT:    smov x14, v0.b[7]
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    add w15, w15, w15, lsr #31
+; CHECK-SD-NEXT:    smull x0, w2, w8
+; CHECK-SD-NEXT:    add w16, w16, w16, lsr #31
+; CHECK-SD-NEXT:    smov x11, v1.b[2]
+; CHECK-SD-NEXT:    smull x2, w3, w8
+; CHECK-SD-NEXT:    smov x3, v0.b[8]
+; CHECK-SD-NEXT:    mov v2.b[1], w12
+; CHECK-SD-NEXT:    msub w15, w15, w9, w10
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    msub w10, w16, w9, w17
+; CHECK-SD-NEXT:    asr x17, x0, #37
+; CHECK-SD-NEXT:    smov w16, v1.b[0]
+; CHECK-SD-NEXT:    asr x0, x2, #37
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w12, v0.b[6]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[2], w1
+; CHECK-SD-NEXT:    smov x1, v1.b[3]
+; CHECK-SD-NEXT:    add w0, w0, w0, lsr #31
+; CHECK-SD-NEXT:    msub w16, w17, w9, w16
+; CHECK-SD-NEXT:    smov w17, v1.b[1]
+; CHECK-SD-NEXT:    asr x2, x11, #37
+; CHECK-SD-NEXT:    msub w11, w18, w9, w12
+; CHECK-SD-NEXT:    asr x12, x14, #37
+; CHECK-SD-NEXT:    smov x18, v1.b[4]
+; CHECK-SD-NEXT:    msub w14, w0, w9, w17
+; CHECK-SD-NEXT:    add w17, w2, w2, lsr #31
+; CHECK-SD-NEXT:    fmov s3, w16
+; CHECK-SD-NEXT:    smull x0, w1, w8
+; CHECK-SD-NEXT:    smov w16, v1.b[2]
+; CHECK-SD-NEXT:    mov v2.b[3], w13
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smull x13, w3, w8
+; CHECK-SD-NEXT:    msub w16, w17, w9, w16
+; CHECK-SD-NEXT:    mov v3.b[1], w14
+; CHECK-SD-NEXT:    smov w14, v0.b[7]
+; CHECK-SD-NEXT:    asr x17, x0, #37
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov x0, v1.b[5]
+; CHECK-SD-NEXT:    asr x13, x13, #37
+; CHECK-SD-NEXT:    msub w12, w12, w9, w14
+; CHECK-SD-NEXT:    smov w14, v1.b[3]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[4], w15
+; CHECK-SD-NEXT:    smov w15, v0.b[8]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    mov v3.b[2], w16
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    smov x17, v1.b[6]
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    smov x16, v0.b[9]
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    msub w13, w13, w9, w15
+; CHECK-SD-NEXT:    smov x15, v0.b[10]
+; CHECK-SD-NEXT:    mov v2.b[5], w10
+; CHECK-SD-NEXT:    smov w10, v1.b[4]
+; CHECK-SD-NEXT:    mov v3.b[3], w14
+; CHECK-SD-NEXT:    asr x14, x0, #37
+; CHECK-SD-NEXT:    smov x0, v1.b[7]
+; CHECK-SD-NEXT:    msub w10, w18, w9, w10
+; CHECK-SD-NEXT:    smov w18, v1.b[5]
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[6], w11
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    smov x11, v0.b[11]
+; CHECK-SD-NEXT:    mov v3.b[4], w10
+; CHECK-SD-NEXT:    msub w14, w14, w9, w18
+; CHECK-SD-NEXT:    smov x18, v1.b[8]
+; CHECK-SD-NEXT:    asr x10, x17, #37
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    smov w17, v0.b[9]
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    mov v2.b[7], w12
+; CHECK-SD-NEXT:    smov w12, v1.b[6]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    mov v3.b[5], w14
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    add w16, w16, w16, lsr #31
+; CHECK-SD-NEXT:    msub w10, w10, w9, w12
+; CHECK-SD-NEXT:    asr x12, x0, #37
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smull x15, w18, w8
+; CHECK-SD-NEXT:    smov x18, v1.b[9]
+; CHECK-SD-NEXT:    smov w0, v1.b[7]
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[8], w13
+; CHECK-SD-NEXT:    mov v3.b[6], w10
+; CHECK-SD-NEXT:    smov w10, v0.b[10]
+; CHECK-SD-NEXT:    msub w16, w16, w9, w17
+; CHECK-SD-NEXT:    msub w12, w12, w9, w0
+; CHECK-SD-NEXT:    asr x13, x15, #37
+; CHECK-SD-NEXT:    smov x17, v1.b[10]
+; CHECK-SD-NEXT:    smull x15, w18, w8
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    smov x18, v0.b[12]
+; CHECK-SD-NEXT:    msub w10, w14, w9, w10
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[8]
+; CHECK-SD-NEXT:    mov v3.b[7], w12
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w12, v0.b[11]
+; CHECK-SD-NEXT:    msub w13, w13, w9, w14
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smov x0, v0.b[13]
+; CHECK-SD-NEXT:    smull x15, w17, w8
+; CHECK-SD-NEXT:    smov x17, v1.b[11]
+; CHECK-SD-NEXT:    mov v2.b[9], w16
+; CHECK-SD-NEXT:    msub w11, w11, w9, w12
+; CHECK-SD-NEXT:    add w12, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[9]
+; CHECK-SD-NEXT:    mov v3.b[8], w13
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    msub w12, w12, w9, w14
+; CHECK-SD-NEXT:    asr x13, x15, #37
+; CHECK-SD-NEXT:    smov x15, v1.b[12]
+; CHECK-SD-NEXT:    smull x14, w17, w8
+; CHECK-SD-NEXT:    smov w17, v1.b[10]
+; CHECK-SD-NEXT:    mov v2.b[10], w10
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    asr x16, x18, #37
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    mov v3.b[9], w12
+; CHECK-SD-NEXT:    smov x12, v1.b[13]
+; CHECK-SD-NEXT:    smov x0, v0.b[14]
+; CHECK-SD-NEXT:    msub w10, w13, w9, w17
+; CHECK-SD-NEXT:    asr x13, x14, #37
+; CHECK-SD-NEXT:    add w14, w16, w16, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    asr x17, x18, #37
+; CHECK-SD-NEXT:    smov w18, v1.b[11]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    smov w16, v0.b[12]
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    mov v3.b[10], w10
+; CHECK-SD-NEXT:    smull x10, w12, w8
+; CHECK-SD-NEXT:    msub w11, w13, w9, w18
+; CHECK-SD-NEXT:    asr x12, x15, #37
+; CHECK-SD-NEXT:    smov x15, v1.b[14]
+; CHECK-SD-NEXT:    msub w13, w14, w9, w16
+; CHECK-SD-NEXT:    smov w16, v1.b[12]
+; CHECK-SD-NEXT:    add w14, w17, w17, lsr #31
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    smov w17, v0.b[13]
+; CHECK-SD-NEXT:    mov v3.b[11], w11
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    smov x11, v0.b[15]
+; CHECK-SD-NEXT:    msub w12, w12, w9, w16
+; CHECK-SD-NEXT:    smov x0, v1.b[15]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    smov w16, v1.b[13]
+; CHECK-SD-NEXT:    mov v2.b[12], w13
+; CHECK-SD-NEXT:    msub w13, w14, w9, w17
+; CHECK-SD-NEXT:    mov v3.b[12], w12
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    asr x12, x18, #37
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w15, v0.b[14]
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[13], w13
+; CHECK-SD-NEXT:    add w13, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[14]
+; CHECK-SD-NEXT:    mov v3.b[13], w10
+; CHECK-SD-NEXT:    msub w12, w12, w9, w15
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    msub w10, w13, w9, w14
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    smov w13, v0.b[15]
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[15]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[14], w12
+; CHECK-SD-NEXT:    mov v3.b[14], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.b[15], w11
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv32i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sshll v4.8h, v1.8b, #0
+; CHECK-GI-NEXT:    movi v20.8b, #100
+; CHECK-GI-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v20.8h, v20.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w4, s5
+; CHECK-GI-NEXT:    fmov w19, s4
+; CHECK-GI-NEXT:    sshll v23.4s, v20.4h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v20.8h, #0
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[1]
+; CHECK-GI-NEXT:    sdiv w17, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[1]
+; CHECK-GI-NEXT:    fmov s6, w12
+; CHECK-GI-NEXT:    sdiv w13, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[2]
+; CHECK-GI-NEXT:    fmov s7, w17
+; CHECK-GI-NEXT:    sdiv w18, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[2]
+; CHECK-GI-NEXT:    mov v6.s[1], w13
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov v7.s[1], w18
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v24.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sdiv w16, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v6.s[2], w11
+; CHECK-GI-NEXT:    fmov w14, s2
+; CHECK-GI-NEXT:    sdiv w23, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[1]
+; CHECK-GI-NEXT:    mov v7.s[2], w16
+; CHECK-GI-NEXT:    sdiv w20, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[1]
+; CHECK-GI-NEXT:    fmov s19, w23
+; CHECK-GI-NEXT:    sdiv w24, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[2]
+; CHECK-GI-NEXT:    fmov s18, w20
+; CHECK-GI-NEXT:    sdiv w2, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov v19.s[1], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w21, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[2]
+; CHECK-GI-NEXT:    fmov s16, w2
+; CHECK-GI-NEXT:    sdiv w22, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[3]
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    mov v18.s[1], w21
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v25.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sdiv w1, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[2]
+; CHECK-GI-NEXT:    mov v19.s[2], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w7, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov v16.s[1], w1
+; CHECK-GI-NEXT:    fmov w25, s5
+; CHECK-GI-NEXT:    mov w26, v5.s[1]
+; CHECK-GI-NEXT:    mov w27, v5.s[2]
+; CHECK-GI-NEXT:    mov w28, v5.s[3]
+; CHECK-GI-NEXT:    sshll2 v5.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    fmov w29, s5
+; CHECK-GI-NEXT:    mov w30, v5.s[1]
+; CHECK-GI-NEXT:    mov w11, v5.s[3]
+; CHECK-GI-NEXT:    sdiv w15, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v3.8h, #0
+; CHECK-GI-NEXT:    mov v18.s[2], w7
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    fmov w0, s2
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    sdiv w6, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[1]
+; CHECK-GI-NEXT:    str w9, [sp, #12] // 4-byte Folded Spill
+; CHECK-GI-NEXT:    mov w9, v5.s[2]
+; CHECK-GI-NEXT:    sdiv w25, w25, w8
+; CHECK-GI-NEXT:    fmov s17, w6
+; CHECK-GI-NEXT:    sdiv w29, w29, w8
+; CHECK-GI-NEXT:    fmov s21, w25
+; CHECK-GI-NEXT:    sdiv w5, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[2]
+; CHECK-GI-NEXT:    fmov s22, w29
+; CHECK-GI-NEXT:    sdiv w26, w26, w8
+; CHECK-GI-NEXT:    mov v17.s[1], w5
+; CHECK-GI-NEXT:    sdiv w30, w30, w8
+; CHECK-GI-NEXT:    mov v21.s[1], w26
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w3, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[3]
+; CHECK-GI-NEXT:    mov v22.s[1], w30
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w27, w27, w8
+; CHECK-GI-NEXT:    mov v17.s[2], w3
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v21.s[2], w27
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    mov v22.s[2], w9
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v7.s[3], w10
+; CHECK-GI-NEXT:    mls v0.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v16.s[3], w14
+; CHECK-GI-NEXT:    mls v3.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v17.s[3], w0
+; CHECK-GI-NEXT:    mls v2.4s, v17.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w19, w19, w8
+; CHECK-GI-NEXT:    mov v18.s[3], w4
+; CHECK-GI-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v25.4s, v18.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w28, w28, w8
+; CHECK-GI-NEXT:    mov v19.s[3], w19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mls v1.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w8, w11, w8
+; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; CHECK-GI-NEXT:    mov v21.s[3], w28
+; CHECK-GI-NEXT:    uzp1 v1.8h, v25.8h, v1.8h
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v6.s[3], w11
+; CHECK-GI-NEXT:    mls v4.4s, v21.4s, v23.4s
+; CHECK-GI-NEXT:    mls v24.4s, v6.4s, v23.4s
+; CHECK-GI-NEXT:    mov v22.s[3], w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v24.8h, v0.8h
+; CHECK-GI-NEXT:    mls v5.4s, v22.4s, v20.4s
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <32 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <32 x i8> %s
+}
+
+define <2 x i8> @uv2i8_7(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #37
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w9
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    umov w8, v2.b[0]
+; CHECK-GI-NEXT:    umov w9, v2.b[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w8
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    neg v2.8b, v2.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 7, i8 7>
+  ret <2 x i8> %s
+}
+
+define <2 x i8> @uv2i8_100(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #41
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 100, i8 100>
+  ret <2 x i8> %s
+}
+
+define <3 x i8> @uv3i8_7(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w9, w11
+; CHECK-SD-NEXT:    add w1, w10, w13
+; CHECK-SD-NEXT:    add w2, w12, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w10
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mul v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    mov b3, v2.b[1]
+; CHECK-GI-NEXT:    mov b4, v2.b[2]
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    mov b2, v1.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[2]
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umov w0, v0.h[0]
+; CHECK-GI-NEXT:    umov w1, v0.h[1]
+; CHECK-GI-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 7, i8 7, i8 7>
+  ret <3 x i8> %s
+}
+
+define <3 x i8> @uv3i8_100(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w0, w11, w14, w9
+; CHECK-SD-NEXT:    msub w1, w13, w14, w10
+; CHECK-SD-NEXT:    msub w2, w8, w14, w12
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w10
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    neg v1.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    neg v1.8b, v3.8b
+; CHECK-GI-NEXT:    fmov s3, w0
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.h[1], w1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.h[2], w2
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mls v3.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w0, v3.h[0]
+; CHECK-GI-NEXT:    umov w1, v3.h[1]
+; CHECK-GI-NEXT:    umov w2, v3.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 100, i8 100, i8 100>
+  ret <3 x i8> %s
+}
+
+define <4 x i8> @uv4i8_7(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w13, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    umull x14, w13, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    lsr x9, x14, #32
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    add w8, w15, w8
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    mov v4.b[1], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    mov v4.b[2], w8
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mov v4.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    uzp1 v1.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v4.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
+  ret <4 x i8> %s
+}
+
+define <4 x i8> @uv4i8_100(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    umull x11, w12, w8
+; CHECK-SD-NEXT:    msub w10, w13, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x9, x11, #32
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w8, w8, w14, w15
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
+  ret <4 x i8> %s
+}
+
+define <8 x i8> @uv8i8_7(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: uv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #37
+; CHECK-SD-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-SD-NEXT:    shrn v2.8b, v2.8h, #1
+; CHECK-SD-NEXT:    add v1.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.8b, #37
+; CHECK-GI-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-GI-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usra v1.8b, v2.8b, #1
+; CHECK-GI-NEXT:    movi v2.8b, #7
+; CHECK-GI-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-GI-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-LABEL: uv8i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.8b, #41
+; CHECK-NEXT:    movi v2.8b, #100
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    ushr v1.8b, v1.8b, #4
+; CHECK-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+define <16 x i8> @uv16i8_7(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-LABEL: uv16i8_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #37
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    usra v1.16b, v2.16b, #1
+; CHECK-NEXT:    movi v2.16b, #7
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #2
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @uv16i8_100(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-LABEL: uv16i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #41
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    movi v2.16b, #100
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #4
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+define <32 x i8> @uv32i8_7(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: uv32i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w10, v0.b[0]
+; CHECK-SD-NEXT:    umov w13, v1.b[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.b[1]
+; CHECK-SD-NEXT:    umov w12, v1.b[1]
+; CHECK-SD-NEXT:    umov w17, v0.b[2]
+; CHECK-SD-NEXT:    umov w0, v1.b[2]
+; CHECK-SD-NEXT:    umov w1, v1.b[3]
+; CHECK-SD-NEXT:    umull x14, w10, w8
+; CHECK-SD-NEXT:    umull x16, w13, w8
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x15, w12, w8
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x2, w17, w8
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    lsr x18, x11, #32
+; CHECK-SD-NEXT:    umov w11, v0.b[3]
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    umull x14, w0, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w13, w13, w16
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    fmov s3, w13
+; CHECK-SD-NEXT:    add w18, w9, w18
+; CHECK-SD-NEXT:    lsr x10, x2, #32
+; CHECK-SD-NEXT:    add w12, w12, w15
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    umov w9, v0.b[4]
+; CHECK-SD-NEXT:    umov w15, v1.b[4]
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    mov v2.b[1], w18
+; CHECK-SD-NEXT:    umull x13, w1, w8
+; CHECK-SD-NEXT:    mov v3.b[1], w12
+; CHECK-SD-NEXT:    sub w2, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w12, w14, w14, lsl #3
+; CHECK-SD-NEXT:    umov w10, v0.b[5]
+; CHECK-SD-NEXT:    umov w18, v1.b[5]
+; CHECK-SD-NEXT:    add w14, w17, w2
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    add w12, w0, w12
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[2], w14
+; CHECK-SD-NEXT:    mov v3.b[2], w12
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w12, v0.b[6]
+; CHECK-SD-NEXT:    add w11, w11, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w14, v1.b[6]
+; CHECK-SD-NEXT:    add w13, w1, w13
+; CHECK-SD-NEXT:    umull x16, w10, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w18, w8
+; CHECK-SD-NEXT:    mov v2.b[3], w11
+; CHECK-SD-NEXT:    mov v3.b[3], w13
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    umov w11, v0.b[7]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w13, v1.b[7]
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[4], w9
+; CHECK-SD-NEXT:    mov v3.b[4], w15
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w9, v0.b[8]
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v1.b[8]
+; CHECK-SD-NEXT:    add w18, w18, w1
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w13, w8
+; CHECK-SD-NEXT:    mov v2.b[5], w10
+; CHECK-SD-NEXT:    mov v3.b[5], w18
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w17
+; CHECK-SD-NEXT:    umov w10, v0.b[9]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w18, v1.b[9]
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[6], w12
+; CHECK-SD-NEXT:    mov v3.b[6], w14
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w12, v0.b[10]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w11, w11, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w14, v1.b[10]
+; CHECK-SD-NEXT:    add w13, w13, w1
+; CHECK-SD-NEXT:    umull x16, w10, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w18, w8
+; CHECK-SD-NEXT:    mov v2.b[7], w11
+; CHECK-SD-NEXT:    mov v3.b[7], w13
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w11, v0.b[11]
+; CHECK-SD-NEXT:    umov w13, v1.b[11]
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[8], w9
+; CHECK-SD-NEXT:    mov v3.b[8], w15
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w9, v0.b[12]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v1.b[12]
+; CHECK-SD-NEXT:    add w18, w18, w1
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w13, w8
+; CHECK-SD-NEXT:    mov v2.b[9], w10
+; CHECK-SD-NEXT:    mov v3.b[9], w18
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w17
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w10, v0.b[13]
+; CHECK-SD-NEXT:    umov w18, v1.b[13]
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[10], w12
+; CHECK-SD-NEXT:    mov v3.b[10], w14
+; CHECK-SD-NEXT:    sub w12, w16, w16, lsl #3
+; CHECK-SD-NEXT:    lsr x16, x17, #32
+; CHECK-SD-NEXT:    sub w17, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w14, v0.b[14]
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    add w13, w13, w17
+; CHECK-SD-NEXT:    umull x17, w18, w8
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    mov v3.b[11], w13
+; CHECK-SD-NEXT:    umov w11, v1.b[14]
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w16
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w13, v0.b[15]
+; CHECK-SD-NEXT:    umov w16, v1.b[15]
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[12], w9
+; CHECK-SD-NEXT:    umull x9, w14, w8
+; CHECK-SD-NEXT:    mov v3.b[12], w15
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    add w12, w18, w17
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    mov v2.b[13], w10
+; CHECK-SD-NEXT:    umull x10, w13, w8
+; CHECK-SD-NEXT:    mov v3.b[13], w12
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w12, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w9, w14, w9
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.b[14], w9
+; CHECK-SD-NEXT:    mov v3.b[14], w11
+; CHECK-SD-NEXT:    sub w9, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    add w8, w16, w8
+; CHECK-SD-NEXT:    mov v2.b[15], w9
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv32i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.8b, #37
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    umull v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    umull v6.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umull v3.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    umull v2.8h, v5.8b, v2.8b
+; CHECK-GI-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-GI-NEXT:    shrn v5.8b, v6.8h, #8
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v5.16b, v2.8h, #8
+; CHECK-GI-NEXT:    shrn2 v6.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v7.16b, v2.8h, #8
+; CHECK-GI-NEXT:    movi v2.16b, #7
+; CHECK-GI-NEXT:    sub v6.16b, v0.16b, v6.16b
+; CHECK-GI-NEXT:    sub v7.16b, v1.16b, v7.16b
+; CHECK-GI-NEXT:    usra v4.16b, v6.16b, #1
+; CHECK-GI-NEXT:    usra v5.16b, v7.16b, #1
+; CHECK-GI-NEXT:    ushr v3.16b, v4.16b, #2
+; CHECK-GI-NEXT:    ushr v4.16b, v5.16b, #2
+; CHECK-GI-NEXT:    mls v0.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    mls v1.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <32 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <32 x i8> %s
+}
+
+define <32 x i8> @uv32i8_100(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: uv32i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w11, v0.b[0]
+; CHECK-SD-NEXT:    umov w14, v1.b[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w10, v0.b[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w13, v1.b[1]
+; CHECK-SD-NEXT:    umov w12, v0.b[2]
+; CHECK-SD-NEXT:    umov w17, v1.b[2]
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    umull x1, w14, w8
+; CHECK-SD-NEXT:    umull x9, w10, w8
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    lsr x0, x9, #32
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w11, w15, w9, w11
+; CHECK-SD-NEXT:    umov w15, v0.b[3]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    msub w14, w1, w9, w14
+; CHECK-SD-NEXT:    umov w1, v1.b[3]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    msub w10, w0, w9, w10
+; CHECK-SD-NEXT:    umull x0, w17, w8
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    umov w11, v0.b[5]
+; CHECK-SD-NEXT:    msub w13, w18, w9, w13
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    umov w14, v1.b[4]
+; CHECK-SD-NEXT:    msub w12, w16, w9, w12
+; CHECK-SD-NEXT:    umov w16, v0.b[4]
+; CHECK-SD-NEXT:    umull x18, w15, w8
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    mov v2.b[1], w10
+; CHECK-SD-NEXT:    mov v3.b[1], w13
+; CHECK-SD-NEXT:    umull x13, w1, w8
+; CHECK-SD-NEXT:    msub w17, w0, w9, w17
+; CHECK-SD-NEXT:    umov w0, v0.b[6]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umull x10, w16, w8
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    mov v2.b[2], w12
+; CHECK-SD-NEXT:    umull x12, w11, w8
+; CHECK-SD-NEXT:    msub w15, w18, w9, w15
+; CHECK-SD-NEXT:    umov w18, v1.b[5]
+; CHECK-SD-NEXT:    mov v3.b[2], w17
+; CHECK-SD-NEXT:    umull x17, w14, w8
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    msub w13, w13, w9, w1
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umov w1, v0.b[7]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    umov w16, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[3], w15
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umull x15, w0, w8
+; CHECK-SD-NEXT:    mov v3.b[3], w13
+; CHECK-SD-NEXT:    umull x13, w18, w8
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    umov w17, v0.b[8]
+; CHECK-SD-NEXT:    msub w11, w12, w9, w11
+; CHECK-SD-NEXT:    umov w12, v1.b[7]
+; CHECK-SD-NEXT:    mov v2.b[4], w10
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    umull x10, w1, w8
+; CHECK-SD-NEXT:    mov v3.b[4], w14
+; CHECK-SD-NEXT:    umull x14, w16, w8
+; CHECK-SD-NEXT:    msub w13, w13, w9, w18
+; CHECK-SD-NEXT:    umov w18, v0.b[9]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[8]
+; CHECK-SD-NEXT:    mov v2.b[5], w11
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    mov v3.b[5], w13
+; CHECK-SD-NEXT:    umull x13, w12, w8
+; CHECK-SD-NEXT:    msub w14, w14, w9, w16
+; CHECK-SD-NEXT:    umov w16, v0.b[10]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w1
+; CHECK-SD-NEXT:    umov w1, v1.b[9]
+; CHECK-SD-NEXT:    mov v2.b[6], w15
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    umull x15, w18, w8
+; CHECK-SD-NEXT:    mov v3.b[6], w14
+; CHECK-SD-NEXT:    umull x14, w0, w8
+; CHECK-SD-NEXT:    msub w12, w13, w9, w12
+; CHECK-SD-NEXT:    umov w13, v0.b[11]
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    umov w17, v1.b[10]
+; CHECK-SD-NEXT:    mov v2.b[7], w10
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    umull x10, w16, w8
+; CHECK-SD-NEXT:    mov v3.b[7], w12
+; CHECK-SD-NEXT:    umull x12, w1, w8
+; CHECK-SD-NEXT:    msub w14, w14, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[11]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w18
+; CHECK-SD-NEXT:    mov v2.b[8], w11
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umull x18, w17, w8
+; CHECK-SD-NEXT:    mov v3.b[8], w14
+; CHECK-SD-NEXT:    umov w14, v0.b[12]
+; CHECK-SD-NEXT:    umull x11, w13, w8
+; CHECK-SD-NEXT:    msub w12, w12, w9, w1
+; CHECK-SD-NEXT:    umull x1, w0, w8
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    mov v2.b[9], w15
+; CHECK-SD-NEXT:    umov w15, v1.b[12]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    mov v3.b[9], w12
+; CHECK-SD-NEXT:    msub w16, w18, w9, w17
+; CHECK-SD-NEXT:    umov w12, v0.b[13]
+; CHECK-SD-NEXT:    lsr x18, x1, #32
+; CHECK-SD-NEXT:    umull x17, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[10], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    umov w13, v0.b[14]
+; CHECK-SD-NEXT:    msub w18, w18, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[13]
+; CHECK-SD-NEXT:    umull x10, w15, w8
+; CHECK-SD-NEXT:    mov v3.b[10], w16
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    umov w17, v1.b[14]
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x11, w13, w8
+; CHECK-SD-NEXT:    mov v3.b[11], w18
+; CHECK-SD-NEXT:    umull x18, w0, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    msub w10, w10, w9, w15
+; CHECK-SD-NEXT:    umov w15, v0.b[15]
+; CHECK-SD-NEXT:    msub w12, w16, w9, w12
+; CHECK-SD-NEXT:    mov v2.b[12], w14
+; CHECK-SD-NEXT:    umov w14, v1.b[15]
+; CHECK-SD-NEXT:    lsr x16, x18, #32
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    mov v3.b[12], w10
+; CHECK-SD-NEXT:    umull x10, w17, w8
+; CHECK-SD-NEXT:    msub w16, w16, w9, w0
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    mov v2.b[13], w12
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x13, w15, w8
+; CHECK-SD-NEXT:    mov v3.b[13], w16
+; CHECK-SD-NEXT:    umull x8, w14, w8
+; CHECK-SD-NEXT:    msub w10, w10, w9, w17
+; CHECK-SD-NEXT:    lsr x12, x13, #32
+; CHECK-SD-NEXT:    mov v2.b[14], w11
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v3.b[14], w10
+; CHECK-SD-NEXT:    msub w11, w12, w9, w15
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.b[15], w11
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv32i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.8b, #41
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    umull v5.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    umull v6.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umull v3.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    umull v2.8h, v4.8b, v2.8b
+; CHECK-GI-NEXT:    shrn v4.8b, v5.8h, #8
+; CHECK-GI-NEXT:    shrn v5.8b, v6.8h, #8
+; CHECK-GI-NEXT:    shrn2 v4.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v5.16b, v2.8h, #8
+; CHECK-GI-NEXT:    movi v2.16b, #100
+; CHECK-GI-NEXT:    ushr v3.16b, v4.16b, #4
+; CHECK-GI-NEXT:    ushr v4.16b, v5.16b, #4
+; CHECK-GI-NEXT:    mls v0.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    mls v1.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <32 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <32 x i8> %s
+}
+
+define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: sv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #16
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #16
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: sv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+define <3 x i16> @sv3i16_7(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: sv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov w12, v0.h[0]
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w14, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    add w8, w8, w11
+; CHECK-SD-NEXT:    add w9, w14, w9, lsr #31
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w12, w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w8, w11, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+define <3 x i16> @sv3i16_100(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: sv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    msub w9, w9, w12, w11
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w10, w10, w12, w13
+; CHECK-SD-NEXT:    msub w8, w8, w12, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: sv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: sv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #19
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: sv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: sv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #3
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+define <16 x i16> @sv16i16_7(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: sv16i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.h[0]
+; CHECK-SD-NEXT:    smov x9, v0.h[1]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    smov w15, v0.h[0]
+; CHECK-SD-NEXT:    smov w12, v0.h[1]
+; CHECK-SD-NEXT:    smov x18, v0.h[3]
+; CHECK-SD-NEXT:    smov w14, v0.h[2]
+; CHECK-SD-NEXT:    smov x3, v1.h[1]
+; CHECK-SD-NEXT:    smov x16, v0.h[4]
+; CHECK-SD-NEXT:    smov x5, v1.h[0]
+; CHECK-SD-NEXT:    smull x0, w10, w8
+; CHECK-SD-NEXT:    smov w13, v0.h[3]
+; CHECK-SD-NEXT:    smov w10, v0.h[4]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smov x17, v0.h[5]
+; CHECK-SD-NEXT:    smull x1, w11, w8
+; CHECK-SD-NEXT:    smov x11, v0.h[6]
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    lsr x2, x9, #32
+; CHECK-SD-NEXT:    smull x3, w3, w8
+; CHECK-SD-NEXT:    smov w9, v0.h[5]
+; CHECK-SD-NEXT:    add w0, w0, w15
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    smull x7, w16, w8
+; CHECK-SD-NEXT:    add w2, w2, w12
+; CHECK-SD-NEXT:    asr w6, w0, #2
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    asr w4, w2, #2
+; CHECK-SD-NEXT:    add w1, w1, w14
+; CHECK-SD-NEXT:    smov w16, v1.h[1]
+; CHECK-SD-NEXT:    add w0, w6, w0, lsr #31
+; CHECK-SD-NEXT:    smull x5, w5, w8
+; CHECK-SD-NEXT:    add w18, w18, w13
+; CHECK-SD-NEXT:    add w2, w4, w2, lsr #31
+; CHECK-SD-NEXT:    asr w4, w1, #2
+; CHECK-SD-NEXT:    lsr x3, x3, #32
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w6, w18, #2
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w1, w4, w1, lsr #31
+; CHECK-SD-NEXT:    smov x4, v1.h[2]
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    smov w0, v1.h[0]
+; CHECK-SD-NEXT:    add w3, w3, w16
+; CHECK-SD-NEXT:    fmov s2, w15
+; CHECK-SD-NEXT:    add w2, w12, w2
+; CHECK-SD-NEXT:    lsr x5, x5, #32
+; CHECK-SD-NEXT:    add w18, w6, w18, lsr #31
+; CHECK-SD-NEXT:    lsr x6, x7, #32
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    smull x4, w4, w8
+; CHECK-SD-NEXT:    smov w7, v1.h[3]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    mov v2.h[1], w2
+; CHECK-SD-NEXT:    asr w2, w3, #2
+; CHECK-SD-NEXT:    add w5, w5, w0
+; CHECK-SD-NEXT:    add w6, w6, w10
+; CHECK-SD-NEXT:    add w14, w14, w1
+; CHECK-SD-NEXT:    smov x1, v1.h[3]
+; CHECK-SD-NEXT:    add w2, w2, w3, lsr #31
+; CHECK-SD-NEXT:    lsr x3, x4, #32
+; CHECK-SD-NEXT:    asr w4, w5, #2
+; CHECK-SD-NEXT:    asr w15, w6, #2
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w17, w17, w9
+; CHECK-SD-NEXT:    add w4, w4, w5, lsr #31
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    mov v2.h[2], w14
+; CHECK-SD-NEXT:    add w6, w15, w6, lsr #31
+; CHECK-SD-NEXT:    smov w15, v1.h[2]
+; CHECK-SD-NEXT:    add w13, w13, w18
+; CHECK-SD-NEXT:    sub w4, w4, w4, lsl #3
+; CHECK-SD-NEXT:    smov x18, v1.h[4]
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w16, w16, w2
+; CHECK-SD-NEXT:    asr w5, w17, #2
+; CHECK-SD-NEXT:    sub w6, w6, w6, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w4
+; CHECK-SD-NEXT:    mov v2.h[3], w13
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    add w3, w3, w15
+; CHECK-SD-NEXT:    fmov s3, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    asr w2, w3, #2
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    add w10, w10, w6
+; CHECK-SD-NEXT:    add w17, w5, w17, lsr #31
+; CHECK-SD-NEXT:    smov w12, v0.h[6]
+; CHECK-SD-NEXT:    smov x14, v0.h[7]
+; CHECK-SD-NEXT:    add w0, w2, w3, lsr #31
+; CHECK-SD-NEXT:    smov w2, v1.h[4]
+; CHECK-SD-NEXT:    mov v3.h[1], w16
+; CHECK-SD-NEXT:    add w16, w1, w7
+; CHECK-SD-NEXT:    smov x1, v1.h[5]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    asr w3, w16, #2
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    mov v2.h[4], w10
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    add w16, w3, w16, lsr #31
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    add w18, w18, w2
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    smov x3, v1.h[6]
+; CHECK-SD-NEXT:    asr w0, w18, #2
+; CHECK-SD-NEXT:    mov v3.h[2], w15
+; CHECK-SD-NEXT:    sub w15, w16, w16, lsl #3
+; CHECK-SD-NEXT:    smov w16, v1.h[5]
+; CHECK-SD-NEXT:    add w10, w0, w18, lsr #31
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x18, x1, #32
+; CHECK-SD-NEXT:    add w15, w7, w15
+; CHECK-SD-NEXT:    smov x1, v1.h[7]
+; CHECK-SD-NEXT:    smull x0, w3, w8
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    smov w13, v0.h[7]
+; CHECK-SD-NEXT:    mov v3.h[3], w15
+; CHECK-SD-NEXT:    add w15, w18, w16
+; CHECK-SD-NEXT:    smov w18, v1.h[6]
+; CHECK-SD-NEXT:    asr w17, w15, #2
+; CHECK-SD-NEXT:    add w10, w2, w10
+; CHECK-SD-NEXT:    mov v2.h[5], w9
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x8, w1, w8
+; CHECK-SD-NEXT:    lsr x9, x14, #32
+; CHECK-SD-NEXT:    add w15, w17, w15, lsr #31
+; CHECK-SD-NEXT:    asr w17, w11, #2
+; CHECK-SD-NEXT:    mov v3.h[4], w10
+; CHECK-SD-NEXT:    add w10, w0, w18
+; CHECK-SD-NEXT:    smov w0, v1.h[7]
+; CHECK-SD-NEXT:    add w11, w17, w11, lsr #31
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    asr w17, w10, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w9, w9, w13
+; CHECK-SD-NEXT:    add w14, w16, w15
+; CHECK-SD-NEXT:    add w10, w17, w10, lsr #31
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[5], w14
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w14, w9, #2
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    asr w15, w8, #2
+; CHECK-SD-NEXT:    add w11, w12, w11
+; CHECK-SD-NEXT:    add w9, w14, w9, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    add w10, w18, w10
+; CHECK-SD-NEXT:    add w8, w15, w8, lsr #31
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    add w8, w0, w8
+; CHECK-SD-NEXT:    mov v2.h[7], w9
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    movi v16.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w17, s3
+; CHECK-GI-NEXT:    mov w18, v3.s[1]
+; CHECK-GI-NEXT:    mov w0, v3.s[2]
+; CHECK-GI-NEXT:    mov w1, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[2]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i16> %s
+}
+
+define <16 x i16> @sv16i16_100(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: sv16i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.h[0]
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x14, v1.h[0]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x9, v0.h[1]
+; CHECK-SD-NEXT:    smov x13, v1.h[1]
+; CHECK-SD-NEXT:    smov x17, v1.h[2]
+; CHECK-SD-NEXT:    smov w16, v0.h[0]
+; CHECK-SD-NEXT:    smov w18, v0.h[2]
+; CHECK-SD-NEXT:    smov x0, v0.h[3]
+; CHECK-SD-NEXT:    smov w15, v0.h[1]
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    smull x12, w9, w8
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    smull x13, w13, w8
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    asr x14, x14, #37
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    asr x13, x13, #37
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    msub w11, w11, w9, w18
+; CHECK-SD-NEXT:    smov w18, v1.h[0]
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smov w16, v1.h[1]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    asr x17, x17, #37
+; CHECK-SD-NEXT:    msub w14, w14, w9, w18
+; CHECK-SD-NEXT:    smov x18, v1.h[3]
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    msub w12, w12, w9, w15
+; CHECK-SD-NEXT:    smov x15, v0.h[4]
+; CHECK-SD-NEXT:    smov w10, v1.h[2]
+; CHECK-SD-NEXT:    msub w13, w13, w9, w16
+; CHECK-SD-NEXT:    smov x16, v0.h[5]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    asr x0, x0, #37
+; CHECK-SD-NEXT:    smov w14, v0.h[4]
+; CHECK-SD-NEXT:    mov v2.h[1], w12
+; CHECK-SD-NEXT:    msub w10, w17, w9, w10
+; CHECK-SD-NEXT:    smov x17, v1.h[4]
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov w12, v0.h[3]
+; CHECK-SD-NEXT:    add w0, w0, w0, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    mov v3.h[1], w13
+; CHECK-SD-NEXT:    smov x13, v1.h[5]
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    mov v2.h[2], w11
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    asr x15, x15, #37
+; CHECK-SD-NEXT:    msub w12, w0, w9, w12
+; CHECK-SD-NEXT:    smov w0, v1.h[3]
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    mov v3.h[2], w10
+; CHECK-SD-NEXT:    add w15, w15, w15, lsr #31
+; CHECK-SD-NEXT:    smull x10, w13, w8
+; CHECK-SD-NEXT:    asr x17, x17, #37
+; CHECK-SD-NEXT:    add w13, w16, w16, lsr #31
+; CHECK-SD-NEXT:    msub w16, w18, w9, w0
+; CHECK-SD-NEXT:    smov x11, v0.h[6]
+; CHECK-SD-NEXT:    mov v2.h[3], w12
+; CHECK-SD-NEXT:    smov x12, v1.h[6]
+; CHECK-SD-NEXT:    msub w14, w15, w9, w14
+; CHECK-SD-NEXT:    add w15, w17, w17, lsr #31
+; CHECK-SD-NEXT:    smov w17, v1.h[4]
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    mov v3.h[3], w16
+; CHECK-SD-NEXT:    smov w18, v0.h[5]
+; CHECK-SD-NEXT:    smov x16, v0.h[7]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w17
+; CHECK-SD-NEXT:    smov x0, v1.h[7]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w17, v1.h[5]
+; CHECK-SD-NEXT:    mov v2.h[4], w14
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    msub w13, w13, w9, w18
+; CHECK-SD-NEXT:    mov v3.h[4], w15
+; CHECK-SD-NEXT:    smov w15, v0.h[6]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w17
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    smull x14, w16, w8
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[5], w13
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smov w13, v1.h[6]
+; CHECK-SD-NEXT:    mov v3.h[5], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w15
+; CHECK-SD-NEXT:    asr x14, x14, #37
+; CHECK-SD-NEXT:    msub w10, w12, w9, w13
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    smov w12, v0.h[7]
+; CHECK-SD-NEXT:    add w13, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.h[7]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    msub w11, w13, w9, w12
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.h[7], w11
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    movi v16.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w17, s3
+; CHECK-GI-NEXT:    mov w18, v3.s[1]
+; CHECK-GI-NEXT:    mov w0, v3.s[2]
+; CHECK-GI-NEXT:    mov w1, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[2]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <16 x i16> %s
+}
+
+define <2 x i16> @uv2i16_7(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushr v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    ushl v2.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+define <2 x i16> @uv2i16_100(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    ushl v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+define <3 x i16> @uv3i16_7(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    add w8, w12, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    umov w10, v0.h[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    umov w11, v0.h[2]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #16 // =0x10
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sub v4.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v3.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v4.4h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+define <3 x i16> @uv3i16_100(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    lsr x11, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w10, w11, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    msub w8, w8, w14, w12
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov w11, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w11
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    umov w10, v1.h[2]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #16 // =0x10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w10
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    neg v3.4h, v4.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v3.4h
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+define <4 x i16> @uv4i16_7(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-SD-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    shrn v2.4h, v2.4s, #1
+; CHECK-SD-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI58_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    usra v1.4h, v2.4h, #1
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.4h, v0.4h, #2
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI59_0
+; CHECK-GI-NEXT:    ushr v1.4h, v0.4h, #2
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #1
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+define <8 x i16> @uv8i16_7(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI60_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+define <8 x i16> @uv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI61_0
+; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.8h, #100
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+define <16 x i16> @uv16i16_7(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: uv16i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w9, v0.h[1]
+; CHECK-SD-NEXT:    umov w10, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    umov w12, v1.h[0]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w11, v1.h[1]
+; CHECK-SD-NEXT:    umov w17, v0.h[2]
+; CHECK-SD-NEXT:    umov w18, v1.h[2]
+; CHECK-SD-NEXT:    umov w0, v0.h[3]
+; CHECK-SD-NEXT:    umov w1, v1.h[3]
+; CHECK-SD-NEXT:    umull x13, w9, w8
+; CHECK-SD-NEXT:    umull x14, w10, w8
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w13
+; CHECK-SD-NEXT:    umull x13, w17, w8
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    umull x14, w18, w8
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w16
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    umov w16, v1.h[4]
+; CHECK-SD-NEXT:    fmov s3, w12
+; CHECK-SD-NEXT:    add w11, w11, w15
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    umov w15, v0.h[4]
+; CHECK-SD-NEXT:    umull x10, w0, w8
+; CHECK-SD-NEXT:    umull x12, w1, w8
+; CHECK-SD-NEXT:    mov v2.h[1], w9
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[1], w11
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    umov w9, v0.h[5]
+; CHECK-SD-NEXT:    add w13, w17, w13
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umov w11, v1.h[5]
+; CHECK-SD-NEXT:    add w14, w18, w14
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umull x17, w15, w8
+; CHECK-SD-NEXT:    umull x18, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[2], w13
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[2], w14
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    umov w13, v0.h[6]
+; CHECK-SD-NEXT:    lsr x14, x17, #32
+; CHECK-SD-NEXT:    add w10, w0, w10
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w12, w1, w12
+; CHECK-SD-NEXT:    umull x0, w11, w8
+; CHECK-SD-NEXT:    mov v2.h[3], w10
+; CHECK-SD-NEXT:    umov w10, v1.h[6]
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[3], w12
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w14, w15, w14
+; CHECK-SD-NEXT:    umov w12, v0.h[7]
+; CHECK-SD-NEXT:    add w15, w16, w18
+; CHECK-SD-NEXT:    lsr x18, x0, #32
+; CHECK-SD-NEXT:    umov w16, v1.h[7]
+; CHECK-SD-NEXT:    mov v2.h[4], w14
+; CHECK-SD-NEXT:    umull x14, w13, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[4], w15
+; CHECK-SD-NEXT:    umull x15, w10, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    add w11, w11, w18
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    mov v2.h[5], w9
+; CHECK-SD-NEXT:    umull x9, w12, w8
+; CHECK-SD-NEXT:    mov v3.h[5], w11
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    sub w11, w14, w14, lsl #3
+; CHECK-SD-NEXT:    sub w14, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w11, w13, w11
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w12, w9
+; CHECK-SD-NEXT:    add w8, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[7], w9
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv16i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI62_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT:    umull v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    umull v6.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umull v3.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    umull v2.4s, v5.4h, v2.4h
+; CHECK-GI-NEXT:    shrn v4.4h, v4.4s, #16
+; CHECK-GI-NEXT:    shrn v5.4h, v6.4s, #16
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.8h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v5.8h, v2.4s, #16
+; CHECK-GI-NEXT:    shrn2 v6.8h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v7.8h, v2.4s, #16
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    sub v6.8h, v0.8h, v6.8h
+; CHECK-GI-NEXT:    sub v7.8h, v1.8h, v7.8h
+; CHECK-GI-NEXT:    usra v4.8h, v6.8h, #1
+; CHECK-GI-NEXT:    usra v5.8h, v7.8h, #1
+; CHECK-GI-NEXT:    ushr v3.8h, v4.8h, #2
+; CHECK-GI-NEXT:    ushr v4.8h, v5.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.8h, v4.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <16 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i16> %s
+}
+
+define <16 x i16> @uv16i16_100(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: uv16i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w11, v0.h[0]
+; CHECK-SD-NEXT:    umov w14, v1.h[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w13, v1.h[1]
+; CHECK-SD-NEXT:    umov w0, v1.h[2]
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    umull x1, w14, w8
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    umull x15, w10, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    msub w11, w16, w9, w11
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    msub w14, w1, w9, w14
+; CHECK-SD-NEXT:    lsr x16, x18, #32
+; CHECK-SD-NEXT:    msub w12, w17, w9, w12
+; CHECK-SD-NEXT:    umov w17, v1.h[3]
+; CHECK-SD-NEXT:    msub w10, w15, w9, w10
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    umull x18, w0, w8
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    msub w13, w16, w9, w13
+; CHECK-SD-NEXT:    umov w16, v0.h[4]
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    mov v2.h[1], w10
+; CHECK-SD-NEXT:    umov w10, v1.h[4]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umull x1, w15, w8
+; CHECK-SD-NEXT:    mov v3.h[1], w13
+; CHECK-SD-NEXT:    umov w13, v0.h[5]
+; CHECK-SD-NEXT:    msub w18, w18, w9, w0
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x14, x1, #32
+; CHECK-SD-NEXT:    umull x0, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[2], w12
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    umov w17, v1.h[5]
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    mov v3.h[2], w18
+; CHECK-SD-NEXT:    msub w14, w14, w9, w15
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v0.h[6]
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    msub w16, w0, w9, w16
+; CHECK-SD-NEXT:    umov w0, v1.h[6]
+; CHECK-SD-NEXT:    mov v2.h[3], w14
+; CHECK-SD-NEXT:    mov v3.h[3], w11
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    msub w10, w12, w9, w10
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umov w12, v0.h[7]
+; CHECK-SD-NEXT:    umull x14, w15, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    msub w13, w18, w9, w13
+; CHECK-SD-NEXT:    mov v2.h[4], w16
+; CHECK-SD-NEXT:    mov v3.h[4], w10
+; CHECK-SD-NEXT:    umull x10, w0, w8
+; CHECK-SD-NEXT:    umov w16, v1.h[7]
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    msub w14, w14, w9, w15
+; CHECK-SD-NEXT:    umull x15, w12, w8
+; CHECK-SD-NEXT:    mov v2.h[5], w13
+; CHECK-SD-NEXT:    mov v3.h[5], w11
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    msub w10, w10, w9, w0
+; CHECK-SD-NEXT:    lsr x11, x15, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.h[6], w14
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w12
+; CHECK-SD-NEXT:    msub w8, w8, w9, w16
+; CHECK-SD-NEXT:    mov v2.h[7], w11
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv16i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ushr v3.8h, v1.8h, #2
+; CHECK-GI-NEXT:    adrp x8, .LCPI63_0
+; CHECK-GI-NEXT:    ldr d4, [x8, :lo12:.LCPI63_0]
+; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    mov d6, v3.d[1]
+; CHECK-GI-NEXT:    umull v2.4s, v2.4h, v4.4h
+; CHECK-GI-NEXT:    umull v3.4s, v3.4h, v4.4h
+; CHECK-GI-NEXT:    umull v5.4s, v5.4h, v4.4h
+; CHECK-GI-NEXT:    umull v4.4s, v6.4h, v4.4h
+; CHECK-GI-NEXT:    shrn v2.4h, v2.4s, #16
+; CHECK-GI-NEXT:    shrn v3.4h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v2.8h, v5.4s, #16
+; CHECK-GI-NEXT:    shrn2 v3.8h, v4.4s, #16
+; CHECK-GI-NEXT:    movi v4.8h, #100
+; CHECK-GI-NEXT:    ushr v2.8h, v2.8h, #1
+; CHECK-GI-NEXT:    ushr v3.8h, v3.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v2.8h, v4.8h
+; CHECK-GI-NEXT:    mls v1.8h, v3.8h, v4.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <16 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <16 x i16> %s
+}
+
+define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w9
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.4s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #2
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v3.4s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #5
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+define <8 x i32> @sv8i32_7(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: sv8i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v6.4s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    smull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    smull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    smull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    add v3.4s, v3.4s, v0.4s
+; CHECK-SD-NEXT:    add v2.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    sshr v4.4s, v3.4s, #2
+; CHECK-SD-NEXT:    sshr v5.4s, v2.4s, #2
+; CHECK-SD-NEXT:    usra v4.4s, v3.4s, #31
+; CHECK-SD-NEXT:    usra v5.4s, v2.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v4.4s, v6.4s
+; CHECK-SD-NEXT:    mls v1.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    fmov w13, s1
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w14, v1.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w15, v1.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov w16, v1.s[3]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    movi v4.4s, #7
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i32> %d, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i32> %s
+}
+
+define <8 x i32> @sv8i32_100(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: sv8i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v6.4s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    smull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    smull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    smull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    sshr v4.4s, v3.4s, #5
+; CHECK-SD-NEXT:    sshr v5.4s, v2.4s, #5
+; CHECK-SD-NEXT:    usra v4.4s, v3.4s, #31
+; CHECK-SD-NEXT:    usra v5.4s, v2.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v4.4s, v6.4s
+; CHECK-SD-NEXT:    mls v1.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    fmov w13, s1
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w14, v1.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w15, v1.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov w16, v1.s[3]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    movi v4.4s, #100
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i32> %d, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+  ret <8 x i32> %s
+}
+
+define <2 x i32> @uv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI72_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI72_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    usra v1.2s, v2.2s, #1
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI73_0
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI73_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #5
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w10, w9, w8
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add w8, w8, w10, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI74_0
+; CHECK-GI-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI74_0]
+; CHECK-GI-NEXT:    mov w8, v0.s[2]
+; CHECK-GI-NEXT:    movk w9, #9362, lsl #16
+; CHECK-GI-NEXT:    mov w10, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x8, w8, w9
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    sub v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v2.4s
+; CHECK-GI-NEXT:    ushl v2.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI75_0
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI75_0]
+; CHECK-GI-NEXT:    mov w8, #5 // =0x5
+; CHECK-GI-NEXT:    mov w10, #34079 // =0x851f
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    movk w10, #20971, lsl #16
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x9, w9, w10
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    lsr x8, x9, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    neg v3.4s, v3.4s
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    fmov s1, w11
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+define <4 x i32> @uv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT:    movi v2.4s, #7
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI76_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI76_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+define <4 x i32> @uv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    movi v2.4s, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI77_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI77_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+define <8 x i32> @uv8i32_7(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: uv8i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    umull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    umull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    sub v4.4s, v0.4s, v3.4s
+; CHECK-SD-NEXT:    sub v5.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    usra v3.4s, v4.4s, #1
+; CHECK-SD-NEXT:    movi v4.4s, #7
+; CHECK-SD-NEXT:    usra v2.4s, v5.4s, #1
+; CHECK-SD-NEXT:    ushr v3.4s, v3.4s, #2
+; CHECK-SD-NEXT:    ushr v2.4s, v2.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI78_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI78_0]
+; CHECK-GI-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    umull v6.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umull v3.2d, v3.2s, v2.2s
+; CHECK-GI-NEXT:    umull v2.2d, v5.2s, v2.2s
+; CHECK-GI-NEXT:    shrn v4.2s, v4.2d, #32
+; CHECK-GI-NEXT:    shrn v5.2s, v6.2d, #32
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v5.4s, v2.2d, #32
+; CHECK-GI-NEXT:    shrn2 v6.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v7.4s, v2.2d, #32
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    sub v6.4s, v0.4s, v6.4s
+; CHECK-GI-NEXT:    sub v7.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    usra v4.4s, v6.4s, #1
+; CHECK-GI-NEXT:    usra v5.4s, v7.4s, #1
+; CHECK-GI-NEXT:    ushr v3.4s, v4.4s, #2
+; CHECK-GI-NEXT:    ushr v4.4s, v5.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i32> %d, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i32> %s
+}
+
+define <8 x i32> @uv8i32_100(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: uv8i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    umull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    umull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    movi v4.4s, #100
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    ushr v3.4s, v3.4s, #5
+; CHECK-SD-NEXT:    ushr v2.4s, v2.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI79_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI79_0]
+; CHECK-GI-NEXT:    umull v5.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    umull v6.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umull v3.2d, v3.2s, v2.2s
+; CHECK-GI-NEXT:    umull v2.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    shrn v4.2s, v5.2d, #32
+; CHECK-GI-NEXT:    shrn v5.2s, v6.2d, #32
+; CHECK-GI-NEXT:    shrn2 v4.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v5.4s, v2.2d, #32
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v3.4s, v4.4s, #5
+; CHECK-GI-NEXT:    ushr v4.4s, v5.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i32> %d, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+  ret <8 x i32> %s
+}
+
+define <2 x i64> @sv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    asr x12, x11, #1
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI80_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI80_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+define <2 x i64> @sv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    add x11, x11, x10
+; CHECK-SD-NEXT:    asr x12, x11, #6
+; CHECK-SD-NEXT:    add x8, x8, x9
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #6
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI81_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI81_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+define <3 x i64> @sv3i64_7(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: sv3i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x10, x9, x8
+; CHECK-SD-NEXT:    smulh x12, x11, x8
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x14, x10, #1
+; CHECK-SD-NEXT:    asr x15, x12, #1
+; CHECK-SD-NEXT:    add x10, x14, x10, lsr #63
+; CHECK-SD-NEXT:    asr x16, x8, #1
+; CHECK-SD-NEXT:    add x12, x15, x12, lsr #63
+; CHECK-SD-NEXT:    sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT:    add x8, x16, x8, lsr #63
+; CHECK-SD-NEXT:    sub x12, x12, x12, lsl #3
+; CHECK-SD-NEXT:    add x9, x9, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x11, x12
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI82_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI82_0]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x13, d4
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sdiv x8, x9, x8
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x10, v3.d[1]
+; CHECK-GI-NEXT:    mul x12, x12, x13
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov d2, x12
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    lsl x10, x8, #3
+; CHECK-GI-NEXT:    sub x8, x10, x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i64> %d, <i64 7, i64 7, i64 7>
+  ret <3 x i64> %s
+}
+
+define <3 x i64> @sv3i64_100(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: sv3i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x10, x9, x8
+; CHECK-SD-NEXT:    smulh x12, x11, x8
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    add x10, x10, x9
+; CHECK-SD-NEXT:    asr x14, x10, #6
+; CHECK-SD-NEXT:    add x12, x12, x11
+; CHECK-SD-NEXT:    add x10, x14, x10, lsr #63
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    asr x15, x12, #6
+; CHECK-SD-NEXT:    add x8, x8, x13
+; CHECK-SD-NEXT:    msub x9, x10, x14, x9
+; CHECK-SD-NEXT:    asr x10, x8, #6
+; CHECK-SD-NEXT:    add x12, x15, x12, lsr #63
+; CHECK-SD-NEXT:    add x8, x10, x8, lsr #63
+; CHECK-SD-NEXT:    msub x10, x12, x14, x11
+; CHECK-SD-NEXT:    msub x8, x8, x14, x13
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI83_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI83_0]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x14, d4
+; CHECK-GI-NEXT:    mov x12, v4.d[1]
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sdiv x10, x9, x8
+; CHECK-GI-NEXT:    fmov x13, d3
+; CHECK-GI-NEXT:    mov x11, v3.d[1]
+; CHECK-GI-NEXT:    mul x13, x13, x14
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    fmov d2, x13
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    msub x8, x10, x8, x9
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i64> %d, <i64 100, i64 100, i64 100>
+  ret <3 x i64> %s
+}
+
+define <4 x i64> @sv4i64_7(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: sv4i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x9, x8
+; CHECK-SD-NEXT:    smulh x14, x12, x8
+; CHECK-SD-NEXT:    smulh x15, x10, x8
+; CHECK-SD-NEXT:    asr x16, x11, #1
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x17, x14, #1
+; CHECK-SD-NEXT:    add x11, x16, x11, lsr #63
+; CHECK-SD-NEXT:    add x14, x17, x14, lsr #63
+; CHECK-SD-NEXT:    asr x18, x15, #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    asr x0, x8, #1
+; CHECK-SD-NEXT:    sub x14, x14, x14, lsl #3
+; CHECK-SD-NEXT:    add x15, x18, x15, lsr #63
+; CHECK-SD-NEXT:    add x9, x9, x11
+; CHECK-SD-NEXT:    add x8, x0, x8, lsr #63
+; CHECK-SD-NEXT:    add x11, x12, x14
+; CHECK-SD-NEXT:    sub x15, x15, x15, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x10, x15
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x11, x11, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI84_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI84_0]
+; CHECK-GI-NEXT:    fmov x9, d4
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    sdiv x8, x12, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x12, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mul x11, x13, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i64> %d, <i64 7, i64 7, i64 7, i64 7>
+  ret <4 x i64> %s
+}
+
+define <4 x i64> @sv4i64_100(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: sv4i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x9, x8
+; CHECK-SD-NEXT:    smulh x14, x12, x8
+; CHECK-SD-NEXT:    smulh x15, x10, x8
+; CHECK-SD-NEXT:    add x11, x11, x9
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x16, x11, #6
+; CHECK-SD-NEXT:    add x14, x14, x12
+; CHECK-SD-NEXT:    asr x17, x14, #6
+; CHECK-SD-NEXT:    add x11, x16, x11, lsr #63
+; CHECK-SD-NEXT:    mov w16, #100 // =0x64
+; CHECK-SD-NEXT:    add x15, x15, x10
+; CHECK-SD-NEXT:    add x14, x17, x14, lsr #63
+; CHECK-SD-NEXT:    msub x9, x11, x16, x9
+; CHECK-SD-NEXT:    asr x11, x15, #6
+; CHECK-SD-NEXT:    add x8, x8, x13
+; CHECK-SD-NEXT:    msub x12, x14, x16, x12
+; CHECK-SD-NEXT:    asr x14, x8, #6
+; CHECK-SD-NEXT:    add x11, x11, x15, lsr #63
+; CHECK-SD-NEXT:    add x8, x14, x8, lsr #63
+; CHECK-SD-NEXT:    msub x10, x11, x16, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    msub x8, x8, x16, x13
+; CHECK-SD-NEXT:    fmov d1, x12
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x11, x11, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI85_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI85_0]
+; CHECK-GI-NEXT:    fmov x9, d4
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    sdiv x8, x12, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x12, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mul x11, x13, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i64> %d, <i64 100, i64 100, i64 100, i64 100>
+  ret <4 x i64> %s
+}
+
+define <2 x i64> @uv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x11, x10, x8
+; CHECK-SD-NEXT:    umulh x8, x9, x8
+; CHECK-SD-NEXT:    sub x12, x10, x11
+; CHECK-SD-NEXT:    add x11, x11, x12, lsr #1
+; CHECK-SD-NEXT:    sub x12, x9, x8
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    add x8, x8, x12, lsr #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x9, v0.d[1]
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
+; CHECK-GI-NEXT:    sub v2.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    usra v1.2d, v2.2d, #1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI86_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+define <2 x i64> @uv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #23592, lsl #16
+; CHECK-SD-NEXT:    movk x8, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x11, x10, #2
+; CHECK-SD-NEXT:    movk x8, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x12, x9, #2
+; CHECK-SD-NEXT:    umulh x11, x11, x8
+; CHECK-SD-NEXT:    umulh x8, x12, x8
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v1.2d, v0.2d, #2
+; CHECK-GI-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI87_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI87_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+define <3 x i64> @uv3i64_7(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: uv3i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x10, x9, x8
+; CHECK-SD-NEXT:    umulh x12, x11, x8
+; CHECK-SD-NEXT:    umulh x8, x13, x8
+; CHECK-SD-NEXT:    sub x14, x9, x10
+; CHECK-SD-NEXT:    add x10, x10, x14, lsr #1
+; CHECK-SD-NEXT:    sub x15, x11, x12
+; CHECK-SD-NEXT:    add x12, x12, x15, lsr #1
+; CHECK-SD-NEXT:    lsr x10, x10, #2
+; CHECK-SD-NEXT:    sub x16, x13, x8
+; CHECK-SD-NEXT:    add x8, x8, x16, lsr #1
+; CHECK-SD-NEXT:    lsr x12, x12, #2
+; CHECK-SD-NEXT:    sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    sub x12, x12, x12, lsl #3
+; CHECK-SD-NEXT:    add x9, x9, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x11, x12
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    mov v4.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    mov v4.d[1], v1.d[0]
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    fmov d5, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI88_0
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    mov v5.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    ldr q3, [x9, :lo12:.LCPI88_0]
+; CHECK-GI-NEXT:    mov x10, v3.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    usra v5.2d, v0.2d, #1
+; CHECK-GI-NEXT:    ushr v0.2d, v5.2d, #2
+; CHECK-GI-NEXT:    mov x9, v0.d[1]
+; CHECK-GI-NEXT:    fmov x11, d0
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    umulh x8, x10, x8
+; CHECK-GI-NEXT:    fmov d0, x11
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    sub x9, x10, x8
+; CHECK-GI-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-GI-NEXT:    sub v0.2d, v4.2d, v0.2d
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x8, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i64> %d, <i64 7, i64 7, i64 7>
+  ret <3 x i64> %s
+}
+
+define <3 x i64> @uv3i64_100(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: uv3i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    mov x10, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x10, #23592, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x10, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x9, x8, #2
+; CHECK-SD-NEXT:    movk x10, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x12, x11, #2
+; CHECK-SD-NEXT:    lsr x14, x13, #2
+; CHECK-SD-NEXT:    umulh x9, x9, x10
+; CHECK-SD-NEXT:    umulh x12, x12, x10
+; CHECK-SD-NEXT:    umulh x10, x14, x10
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    lsr x9, x9, #2
+; CHECK-SD-NEXT:    msub x8, x9, x14, x8
+; CHECK-SD-NEXT:    lsr x9, x12, #2
+; CHECK-SD-NEXT:    lsr x10, x10, #2
+; CHECK-SD-NEXT:    msub x9, x9, x14, x11
+; CHECK-SD-NEXT:    msub x10, x10, x14, x13
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    fmov d1, x9
+; CHECK-SD-NEXT:    fmov d2, x10
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v3.16b, v0.16b
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov x9, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x9, #23592, lsl #16
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x9, #49807, lsl #32
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x9, #10485, lsl #48
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    mov x8, v3.d[1]
+; CHECK-GI-NEXT:    umulh x10, x10, x9
+; CHECK-GI-NEXT:    umulh x8, x8, x9
+; CHECK-GI-NEXT:    fmov d3, x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI89_0
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI89_0]
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    fmov x12, d4
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    mov x8, v3.d[1]
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x8, x8, x10
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    lsr x12, x10, #2
+; CHECK-GI-NEXT:    fmov d2, x11
+; CHECK-GI-NEXT:    umulh x9, x12, x9
+; CHECK-GI-NEXT:    mov v2.d[1], x8
+; CHECK-GI-NEXT:    lsr x8, x9, #2
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    msub x8, x8, x9, x10
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i64> %d, <i64 100, i64 100, i64 100>
+  ret <3 x i64> %s
+}
+
+define <4 x i64> @uv4i64_7(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: uv4i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x11, x9, x8
+; CHECK-SD-NEXT:    umulh x14, x12, x8
+; CHECK-SD-NEXT:    umulh x15, x10, x8
+; CHECK-SD-NEXT:    sub x16, x9, x11
+; CHECK-SD-NEXT:    umulh x8, x13, x8
+; CHECK-SD-NEXT:    add x11, x11, x16, lsr #1
+; CHECK-SD-NEXT:    sub x17, x12, x14
+; CHECK-SD-NEXT:    add x14, x14, x17, lsr #1
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    sub x16, x10, x15
+; CHECK-SD-NEXT:    add x15, x15, x16, lsr #1
+; CHECK-SD-NEXT:    lsr x14, x14, #2
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    sub x16, x13, x8
+; CHECK-SD-NEXT:    add x8, x8, x16, lsr #1
+; CHECK-SD-NEXT:    sub x14, x14, x14, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #2
+; CHECK-SD-NEXT:    add x9, x9, x11
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    add x11, x12, x14
+; CHECK-SD-NEXT:    sub x15, x15, x15, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x10, x15
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    umulh x11, x11, x8
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    umulh x8, x12, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI90_0
+; CHECK-GI-NEXT:    sub v4.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v5.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    usra v2.2d, v4.2d, #1
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI90_0]
+; CHECK-GI-NEXT:    usra v3.2d, v5.2d, #1
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    ushr v2.2d, v2.2d, #2
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    mul x8, x12, x8
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    mul x10, x13, x10
+; CHECK-GI-NEXT:    fmov d3, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i64> %d, <i64 7, i64 7, i64 7, i64 7>
+  ret <4 x i64> %s
+}
+
+define <4 x i64> @uv4i64_100(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: uv4i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #23592, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x11, x9, #2
+; CHECK-SD-NEXT:    movk x8, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x14, x12, #2
+; CHECK-SD-NEXT:    umulh x11, x11, x8
+; CHECK-SD-NEXT:    lsr x15, x10, #2
+; CHECK-SD-NEXT:    lsr x16, x13, #2
+; CHECK-SD-NEXT:    umulh x14, x14, x8
+; CHECK-SD-NEXT:    umulh x15, x15, x8
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    umulh x8, x16, x8
+; CHECK-SD-NEXT:    mov w16, #100 // =0x64
+; CHECK-SD-NEXT:    msub x9, x11, x16, x9
+; CHECK-SD-NEXT:    lsr x11, x14, #2
+; CHECK-SD-NEXT:    msub x11, x11, x16, x12
+; CHECK-SD-NEXT:    lsr x12, x15, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    msub x10, x12, x16, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    msub x8, x8, x16, x13
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v2.2d, v0.2d, #2
+; CHECK-GI-NEXT:    ushr v3.2d, v1.2d, #2
+; CHECK-GI-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    umulh x11, x11, x8
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    umulh x8, x12, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI91_0
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI91_0]
+; CHECK-GI-NEXT:    ushr v2.2d, v2.2d, #2
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    mul x8, x12, x8
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    mul x10, x13, x10
+; CHECK-GI-NEXT:    fmov d3, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i64> %d, <i64 100, i64 100, i64 100, i64 100>
+  ret <4 x i64> %s
+}
+
+define <2 x i128> @sv2i128_7(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 7, i128 7>
+  ret <2 x i128> %s
+}
+
+define <2 x i128> @sv2i128_100(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 100, i128 100>
+  ret <2 x i128> %s
+}
+
+define <3 x i128> @sv3i128_7(<3 x i128> %d, <3 x i128> %e) {
+; CHECK-SD-LABEL: sv3i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w30, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x23, x0
+; CHECK-GI-NEXT:    mov x24, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x4, x0
+; CHECK-GI-NEXT:    mov x5, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i128> %d, <i128 7, i128 7, i128 7>
+  ret <3 x i128> %s
+}
+
+define <3 x i128> @sv3i128_100(<3 x i128> %d, <3 x i128> %e) {
+; CHECK-SD-LABEL: sv3i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w30, -64
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x23, x0
+; CHECK-GI-NEXT:    mov x24, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x4, x0
+; CHECK-GI-NEXT:    mov x5, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i128> %d, <i128 100, i128 100, i128 100>
+  ret <3 x i128> %s
+}
+
+define <4 x i128> @sv4i128_7(<4 x i128> %d, <4 x i128> %e) {
+; CHECK-SD-LABEL: sv4i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w30, -80
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x25, x0
+; CHECK-SD-NEXT:    mov x26, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x25
+; CHECK-SD-NEXT:    mov x1, x26
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w30, -80
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    mov x23, x6
+; CHECK-GI-NEXT:    mov x24, x7
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x25, x0
+; CHECK-GI-NEXT:    mov x26, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x6, x0
+; CHECK-GI-NEXT:    mov x7, x1
+; CHECK-GI-NEXT:    mov x0, x25
+; CHECK-GI-NEXT:    mov x1, x26
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    mov x4, x21
+; CHECK-GI-NEXT:    mov x5, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i128> %d, <i128 7, i128 7, i128 7, i128 7>
+  ret <4 x i128> %s
+}
+
+define <4 x i128> @sv4i128_100(<4 x i128> %d, <4 x i128> %e) {
+; CHECK-SD-LABEL: sv4i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w30, -80
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x25, x0
+; CHECK-SD-NEXT:    mov x26, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x25
+; CHECK-SD-NEXT:    mov x1, x26
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w30, -80
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    mov x21, x4
+; CHECK-GI-NEXT:    mov x22, x5
+; CHECK-GI-NEXT:    mov x23, x6
+; CHECK-GI-NEXT:    mov x24, x7
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x25, x0
+; CHECK-GI-NEXT:    mov x26, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x19, x0
+; CHECK-GI-NEXT:    mov x20, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x23
+; CHECK-GI-NEXT:    mov x1, x24
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x6, x0
+; CHECK-GI-NEXT:    mov x7, x1
+; CHECK-GI-NEXT:    mov x0, x25
+; CHECK-GI-NEXT:    mov x1, x26
+; CHECK-GI-NEXT:    mov x2, x19
+; CHECK-GI-NEXT:    mov x3, x20
+; CHECK-GI-NEXT:    mov x4, x21
+; CHECK-GI-NEXT:    mov x5, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i128> %d, <i128 100, i128 100, i128 100, i128 100>
+  ret <4 x i128> %s
+}
+
+define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: uv2i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x10, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x8, #9362 // =0x2492
+; CHECK-GI-NEXT:    sub x4, x0, x0
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    umulh x18, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x12, x0, x8
+; CHECK-GI-NEXT:    umulh x13, x0, x10
+; CHECK-GI-NEXT:    mul x14, x1, x8
+; CHECK-GI-NEXT:    adds x11, x11, x12
+; CHECK-GI-NEXT:    umulh x15, x1, x10
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    cmn x11, x13
+; CHECK-GI-NEXT:    and x11, x12, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    add x14, x14, x4
+; CHECK-GI-NEXT:    and x12, x12, #0x1
+; CHECK-GI-NEXT:    and x4, xzr, #0x1
+; CHECK-GI-NEXT:    mul x13, x3, x10
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    adds x14, x14, x15
+; CHECK-GI-NEXT:    add x12, x12, x4
+; CHECK-GI-NEXT:    mul x5, x2, x8
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    adds x14, x14, x16
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    adds x11, x14, x11
+; CHECK-GI-NEXT:    add x12, x12, x16
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    umulh x17, x1, x8
+; CHECK-GI-NEXT:    add x12, x12, x16
+; CHECK-GI-NEXT:    adds x13, x13, x5
+; CHECK-GI-NEXT:    umulh x15, x2, x10
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    and x16, x4, #0x1
+; CHECK-GI-NEXT:    mul x6, x3, x8
+; CHECK-GI-NEXT:    umulh x10, x3, x10
+; CHECK-GI-NEXT:    cmn x13, x15
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    add x14, x9, x17
+; CHECK-GI-NEXT:    umulh x15, x2, x8
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    add x13, x14, x18
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    sub x17, x2, x2
+; CHECK-GI-NEXT:    and x18, xzr, #0x1
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    add x14, x16, x14
+; CHECK-GI-NEXT:    add x16, x6, x17
+; CHECK-GI-NEXT:    and x17, xzr, #0x1
+; CHECK-GI-NEXT:    adds x10, x16, x10
+; CHECK-GI-NEXT:    add x17, x17, x18
+; CHECK-GI-NEXT:    cset w16, hs
+; CHECK-GI-NEXT:    adds x10, x10, x15
+; CHECK-GI-NEXT:    umulh x15, x2, xzr
+; CHECK-GI-NEXT:    and x16, x16, #0x1
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x10, x10, x14
+; CHECK-GI-NEXT:    add x16, x17, x16
+; CHECK-GI-NEXT:    and x17, x18, #0x1
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    add x13, x16, x17
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    subs x9, x0, x11
+; CHECK-GI-NEXT:    add x13, x13, x14
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    sbc x14, x1, x12
+; CHECK-GI-NEXT:    add x8, x8, x13
+; CHECK-GI-NEXT:    subs x13, x2, x10
+; CHECK-GI-NEXT:    lsl x15, x14, #63
+; CHECK-GI-NEXT:    sbc x16, x3, x8
+; CHECK-GI-NEXT:    lsr x14, x14, #1
+; CHECK-GI-NEXT:    orr x9, x15, x9, lsr #1
+; CHECK-GI-NEXT:    lsl x15, x16, #63
+; CHECK-GI-NEXT:    orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    lsr x11, x16, #1
+; CHECK-GI-NEXT:    adc x12, x14, x12
+; CHECK-GI-NEXT:    adds x10, x13, x10
+; CHECK-GI-NEXT:    lsl x13, x12, #62
+; CHECK-GI-NEXT:    lsr x12, x12, #2
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsl x11, x8, #62
+; CHECK-GI-NEXT:    orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT:    mov w13, #7 // =0x7
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x14, x12, #3
+; CHECK-GI-NEXT:    orr x10, x11, x10, lsr #2
+; CHECK-GI-NEXT:    umulh x11, x9, x13
+; CHECK-GI-NEXT:    lsl x15, x9, #3
+; CHECK-GI-NEXT:    sub x12, x14, x12
+; CHECK-GI-NEXT:    lsl x16, x8, #3
+; CHECK-GI-NEXT:    umulh x13, x10, x13
+; CHECK-GI-NEXT:    lsl x14, x10, #3
+; CHECK-GI-NEXT:    sub x9, x15, x9
+; CHECK-GI-NEXT:    sub x8, x16, x8
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    sub x10, x14, x10
+; CHECK-GI-NEXT:    add x11, x12, x11
+; CHECK-GI-NEXT:    sbc x1, x1, x11
+; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    add x8, x8, x13
+; CHECK-GI-NEXT:    sbc x3, x3, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i128> %d, <i128 7, i128 7>
+  ret <2 x i128> %s
+}
+
+define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: uv2i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x8, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    sub x18, x0, x0
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x12, x0, x8
+; CHECK-GI-NEXT:    umulh x13, x0, x10
+; CHECK-GI-NEXT:    mul x14, x1, x8
+; CHECK-GI-NEXT:    adds x11, x11, x12
+; CHECK-GI-NEXT:    umulh x15, x1, x10
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    cmn x11, x13
+; CHECK-GI-NEXT:    and x11, x12, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    and x12, x12, #0x1
+; CHECK-GI-NEXT:    add x14, x14, x18
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    adds x14, x14, x15
+; CHECK-GI-NEXT:    and x15, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x17, x1, x8
+; CHECK-GI-NEXT:    cset w4, hs
+; CHECK-GI-NEXT:    add x15, x12, x15
+; CHECK-GI-NEXT:    adds x12, x14, x16
+; CHECK-GI-NEXT:    and x4, x4, #0x1
+; CHECK-GI-NEXT:    mul x18, x3, x10
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    adds x12, x12, x11
+; CHECK-GI-NEXT:    add x11, x15, x4
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    mul x5, x2, x8
+; CHECK-GI-NEXT:    add x11, x11, x14
+; CHECK-GI-NEXT:    and x14, x15, #0x1
+; CHECK-GI-NEXT:    add x17, x9, x17
+; CHECK-GI-NEXT:    add x14, x11, x14
+; CHECK-GI-NEXT:    mov w11, #100 // =0x64
+; CHECK-GI-NEXT:    umulh x13, x0, xzr
+; CHECK-GI-NEXT:    umulh x16, x2, x10
+; CHECK-GI-NEXT:    adds x18, x18, x5
+; CHECK-GI-NEXT:    mul x15, x3, x8
+; CHECK-GI-NEXT:    add x13, x17, x13
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    umulh x10, x3, x10
+; CHECK-GI-NEXT:    add x13, x13, x14
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    cmn x18, x16
+; CHECK-GI-NEXT:    sub x18, x2, x2
+; CHECK-GI-NEXT:    umulh x16, x2, x8
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    and x14, x14, #0x1
+; CHECK-GI-NEXT:    add x15, x15, x18
+; CHECK-GI-NEXT:    and x18, xzr, #0x1
+; CHECK-GI-NEXT:    add x14, x17, x14
+; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    and x17, xzr, #0x1
+; CHECK-GI-NEXT:    adds x10, x15, x10
+; CHECK-GI-NEXT:    add x15, x17, x18
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    umulh x18, x2, xzr
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    adds x10, x10, x16
+; CHECK-GI-NEXT:    lsl x16, x13, #60
+; CHECK-GI-NEXT:    add x15, x15, x17
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    adds x10, x10, x14
+; CHECK-GI-NEXT:    and x14, x17, #0x1
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    add x14, x15, x14
+; CHECK-GI-NEXT:    and x15, x17, #0x1
+; CHECK-GI-NEXT:    orr x12, x16, x12, lsr #4
+; CHECK-GI-NEXT:    add x9, x14, x15
+; CHECK-GI-NEXT:    add x8, x8, x18
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    lsr x9, x13, #4
+; CHECK-GI-NEXT:    umulh x14, x12, x11
+; CHECK-GI-NEXT:    lsl x13, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    mul x12, x12, x11
+; CHECK-GI-NEXT:    orr x10, x13, x10, lsr #4
+; CHECK-GI-NEXT:    madd x9, x9, x11, x14
+; CHECK-GI-NEXT:    umulh x13, x10, x11
+; CHECK-GI-NEXT:    subs x0, x0, x12
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    sbc x1, x1, x9
+; CHECK-GI-NEXT:    madd x8, x8, x11, x13
+; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    sbc x3, x3, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i128> %d, <i128 100, i128 100>
+  ret <2 x i128> %s
+}
+
+define <3 x i128> @uv3i128_7(<3 x i128> %d, <3 x i128> %e) {
+; CHECK-SD-LABEL: uv3i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x21, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -32
+; CHECK-GI-NEXT:    mov x10, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x8, #9362 // =0x2492
+; CHECK-GI-NEXT:    and x21, xzr, #0x1
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    umulh x12, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x14, x0, x8
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    mul x16, x1, x8
+; CHECK-GI-NEXT:    adds x11, x11, x14
+; CHECK-GI-NEXT:    umulh x17, x1, x10
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    cmn x11, x15
+; CHECK-GI-NEXT:    sub x15, x0, x0
+; CHECK-GI-NEXT:    and x19, x19, #0x1
+; CHECK-GI-NEXT:    umulh x18, x0, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    add x15, x16, x15
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    and x16, xzr, #0x1
+; CHECK-GI-NEXT:    mul x6, x3, x10
+; CHECK-GI-NEXT:    add x11, x19, x11
+; CHECK-GI-NEXT:    add x16, x21, x16
+; CHECK-GI-NEXT:    adds x15, x15, x17
+; CHECK-GI-NEXT:    mul x7, x2, x8
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    and x19, x19, #0x1
+; CHECK-GI-NEXT:    adds x15, x15, x18
+; CHECK-GI-NEXT:    umulh x14, x2, x10
+; CHECK-GI-NEXT:    add x16, x16, x19
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    adds x11, x15, x11
+; CHECK-GI-NEXT:    and x15, x19, #0x1
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    add x15, x16, x15
+; CHECK-GI-NEXT:    adds x6, x6, x7
+; CHECK-GI-NEXT:    and x16, x19, #0x1
+; CHECK-GI-NEXT:    umulh x13, x1, x8
+; CHECK-GI-NEXT:    cset w7, hs
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    cmn x6, x14
+; CHECK-GI-NEXT:    sub x14, x2, x2
+; CHECK-GI-NEXT:    mul x20, x3, x8
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    and x6, x6, #0x1
+; CHECK-GI-NEXT:    umulh x17, x3, x10
+; CHECK-GI-NEXT:    add x13, x9, x13
+; CHECK-GI-NEXT:    umulh x18, x2, x8
+; CHECK-GI-NEXT:    add x13, x13, x12
+; CHECK-GI-NEXT:    and x12, x7, #0x1
+; CHECK-GI-NEXT:    add x14, x20, x14
+; CHECK-GI-NEXT:    and x7, xzr, #0x1
+; CHECK-GI-NEXT:    and x20, xzr, #0x1
+; CHECK-GI-NEXT:    mul x16, x5, x10
+; CHECK-GI-NEXT:    add x7, x7, x20
+; CHECK-GI-NEXT:    add x12, x12, x6
+; CHECK-GI-NEXT:    adds x14, x14, x17
+; CHECK-GI-NEXT:    add x13, x13, x15
+; CHECK-GI-NEXT:    mul x17, x4, x8
+; CHECK-GI-NEXT:    cset w20, hs
+; CHECK-GI-NEXT:    adds x14, x14, x18
+; CHECK-GI-NEXT:    and x20, x20, #0x1
+; CHECK-GI-NEXT:    umulh x18, x4, x10
+; CHECK-GI-NEXT:    add x6, x7, x20
+; CHECK-GI-NEXT:    cset w7, hs
+; CHECK-GI-NEXT:    and x7, x7, #0x1
+; CHECK-GI-NEXT:    adds x12, x14, x12
+; CHECK-GI-NEXT:    umulh x21, x3, x8
+; CHECK-GI-NEXT:    add x15, x6, x7
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    adds x16, x16, x17
+; CHECK-GI-NEXT:    and x6, x6, #0x1
+; CHECK-GI-NEXT:    umulh x19, x2, xzr
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    cmn x16, x18
+; CHECK-GI-NEXT:    sub x18, x4, x4
+; CHECK-GI-NEXT:    mul x14, x5, x8
+; CHECK-GI-NEXT:    add x7, x9, x21
+; CHECK-GI-NEXT:    umulh x20, x5, x10
+; CHECK-GI-NEXT:    add x10, x15, x6
+; CHECK-GI-NEXT:    and x6, xzr, #0x1
+; CHECK-GI-NEXT:    add x15, x7, x19
+; CHECK-GI-NEXT:    umulh x16, x4, x8
+; CHECK-GI-NEXT:    add x10, x15, x10
+; CHECK-GI-NEXT:    and x15, x17, #0x1
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    add x14, x14, x18
+; CHECK-GI-NEXT:    and x18, xzr, #0x1
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    umulh x8, x5, x8
+; CHECK-GI-NEXT:    adds x14, x14, x20
+; CHECK-GI-NEXT:    add x15, x15, x17
+; CHECK-GI-NEXT:    add x17, x6, x18
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    adds x14, x14, x16
+; CHECK-GI-NEXT:    umulh x16, x4, xzr
+; CHECK-GI-NEXT:    and x18, x18, #0x1
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    adds x14, x14, x15
+; CHECK-GI-NEXT:    add x15, x17, x18
+; CHECK-GI-NEXT:    and x17, x6, #0x1
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    add x15, x15, x17
+; CHECK-GI-NEXT:    and x17, x18, #0x1
+; CHECK-GI-NEXT:    subs x18, x0, x11
+; CHECK-GI-NEXT:    sbc x9, x1, x13
+; CHECK-GI-NEXT:    add x15, x15, x17
+; CHECK-GI-NEXT:    add x8, x8, x16
+; CHECK-GI-NEXT:    subs x16, x2, x12
+; CHECK-GI-NEXT:    lsl x17, x9, #63
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    sbc x15, x3, x10
+; CHECK-GI-NEXT:    subs x6, x4, x14
+; CHECK-GI-NEXT:    orr x17, x17, x18, lsr #1
+; CHECK-GI-NEXT:    lsl x18, x15, #63
+; CHECK-GI-NEXT:    sbc x7, x5, x8
+; CHECK-GI-NEXT:    lsr x9, x9, #1
+; CHECK-GI-NEXT:    lsr x15, x15, #1
+; CHECK-GI-NEXT:    orr x16, x18, x16, lsr #1
+; CHECK-GI-NEXT:    lsl x18, x7, #63
+; CHECK-GI-NEXT:    adds x11, x17, x11
+; CHECK-GI-NEXT:    adc x9, x9, x13
+; CHECK-GI-NEXT:    lsr x17, x7, #1
+; CHECK-GI-NEXT:    orr x13, x18, x6, lsr #1
+; CHECK-GI-NEXT:    adds x12, x16, x12
+; CHECK-GI-NEXT:    lsl x16, x9, #62
+; CHECK-GI-NEXT:    adc x10, x15, x10
+; CHECK-GI-NEXT:    mov w15, #7 // =0x7
+; CHECK-GI-NEXT:    lsr x9, x9, #2
+; CHECK-GI-NEXT:    adds x13, x13, x14
+; CHECK-GI-NEXT:    lsl x14, x10, #62
+; CHECK-GI-NEXT:    orr x11, x16, x11, lsr #2
+; CHECK-GI-NEXT:    adc x8, x17, x8
+; CHECK-GI-NEXT:    lsl x17, x9, #3
+; CHECK-GI-NEXT:    lsr x10, x10, #2
+; CHECK-GI-NEXT:    orr x12, x14, x12, lsr #2
+; CHECK-GI-NEXT:    lsl x14, x8, #62
+; CHECK-GI-NEXT:    umulh x16, x11, x15
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x18, x11, #3
+; CHECK-GI-NEXT:    sub x9, x17, x9
+; CHECK-GI-NEXT:    orr x13, x14, x13, lsr #2
+; CHECK-GI-NEXT:    umulh x14, x12, x15
+; CHECK-GI-NEXT:    lsl x17, x12, #3
+; CHECK-GI-NEXT:    lsl x6, x10, #3
+; CHECK-GI-NEXT:    sub x11, x18, x11
+; CHECK-GI-NEXT:    umulh x15, x13, x15
+; CHECK-GI-NEXT:    sub x12, x17, x12
+; CHECK-GI-NEXT:    lsl x17, x8, #3
+; CHECK-GI-NEXT:    add x9, x9, x16
+; CHECK-GI-NEXT:    lsl x16, x13, #3
+; CHECK-GI-NEXT:    sub x10, x6, x10
+; CHECK-GI-NEXT:    subs x0, x0, x11
+; CHECK-GI-NEXT:    sub x8, x17, x8
+; CHECK-GI-NEXT:    add x10, x10, x14
+; CHECK-GI-NEXT:    sub x13, x16, x13
+; CHECK-GI-NEXT:    sbc x1, x1, x9
+; CHECK-GI-NEXT:    subs x2, x2, x12
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    sbc x3, x3, x10
+; CHECK-GI-NEXT:    subs x4, x4, x13
+; CHECK-GI-NEXT:    sbc x5, x5, x8
+; CHECK-GI-NEXT:    ldr x21, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i128> %d, <i128 7, i128 7, i128 7>
+  ret <3 x i128> %s
+}
+
+define <3 x i128> @uv3i128_100(<3 x i128> %d, <3 x i128> %e) {
+; CHECK-SD-LABEL: uv3i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w30, -64
+; CHECK-SD-NEXT:    mov x21, x3
+; CHECK-SD-NEXT:    mov x22, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x5
+; CHECK-SD-NEXT:    mov x20, x4
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x4, x0
+; CHECK-SD-NEXT:    mov x5, x1
+; CHECK-SD-NEXT:    mov x0, x23
+; CHECK-SD-NEXT:    mov x1, x24
+; CHECK-SD-NEXT:    mov x2, x21
+; CHECK-SD-NEXT:    mov x3, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x21, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -32
+; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x9, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    and x21, xzr, #0x1
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x9, #23592, lsl #16
+; CHECK-GI-NEXT:    umulh x12, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x9, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x9, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x11, x1, x10
+; CHECK-GI-NEXT:    mul x14, x0, x9
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    mul x16, x1, x9
+; CHECK-GI-NEXT:    adds x11, x11, x14
+; CHECK-GI-NEXT:    umulh x17, x1, x10
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    cmn x11, x15
+; CHECK-GI-NEXT:    sub x15, x0, x0
+; CHECK-GI-NEXT:    and x19, x19, #0x1
+; CHECK-GI-NEXT:    umulh x18, x0, x9
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    add x15, x16, x15
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    and x16, xzr, #0x1
+; CHECK-GI-NEXT:    mul x6, x3, x10
+; CHECK-GI-NEXT:    add x11, x19, x11
+; CHECK-GI-NEXT:    add x16, x21, x16
+; CHECK-GI-NEXT:    adds x15, x15, x17
+; CHECK-GI-NEXT:    mul x7, x2, x9
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    and x19, x19, #0x1
+; CHECK-GI-NEXT:    adds x15, x15, x18
+; CHECK-GI-NEXT:    umulh x14, x2, x10
+; CHECK-GI-NEXT:    add x16, x16, x19
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    adds x11, x15, x11
+; CHECK-GI-NEXT:    and x15, x19, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x10
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    add x15, x16, x15
+; CHECK-GI-NEXT:    adds x6, x6, x7
+; CHECK-GI-NEXT:    and x16, x19, #0x1
+; CHECK-GI-NEXT:    umulh x13, x1, x9
+; CHECK-GI-NEXT:    cset w7, hs
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    cmn x6, x14
+; CHECK-GI-NEXT:    sub x14, x2, x2
+; CHECK-GI-NEXT:    mul x20, x3, x9
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    and x6, x6, #0x1
+; CHECK-GI-NEXT:    umulh x17, x3, x10
+; CHECK-GI-NEXT:    add x13, x8, x13
+; CHECK-GI-NEXT:    umulh x18, x2, x9
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, x7, #0x1
+; CHECK-GI-NEXT:    add x14, x20, x14
+; CHECK-GI-NEXT:    and x7, xzr, #0x1
+; CHECK-GI-NEXT:    and x20, xzr, #0x1
+; CHECK-GI-NEXT:    mul x16, x5, x10
+; CHECK-GI-NEXT:    add x7, x7, x20
+; CHECK-GI-NEXT:    add x13, x13, x6
+; CHECK-GI-NEXT:    adds x14, x14, x17
+; CHECK-GI-NEXT:    mul x17, x4, x9
+; CHECK-GI-NEXT:    cset w20, hs
+; CHECK-GI-NEXT:    and x20, x20, #0x1
+; CHECK-GI-NEXT:    adds x14, x14, x18
+; CHECK-GI-NEXT:    umulh x21, x3, x9
+; CHECK-GI-NEXT:    add x6, x7, x20
+; CHECK-GI-NEXT:    cset w7, hs
+; CHECK-GI-NEXT:    and x7, x7, #0x1
+; CHECK-GI-NEXT:    adds x13, x14, x13
+; CHECK-GI-NEXT:    add x14, x12, x15
+; CHECK-GI-NEXT:    umulh x18, x4, x10
+; CHECK-GI-NEXT:    add x15, x6, x7
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    and x6, x6, #0x1
+; CHECK-GI-NEXT:    adds x16, x16, x17
+; CHECK-GI-NEXT:    lsl x7, x14, #60
+; CHECK-GI-NEXT:    umulh x19, x2, xzr
+; CHECK-GI-NEXT:    add x15, x15, x6
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    add x6, x8, x21
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    orr x11, x7, x11, lsr #4
+; CHECK-GI-NEXT:    mul x20, x5, x9
+; CHECK-GI-NEXT:    mov w12, #100 // =0x64
+; CHECK-GI-NEXT:    and x7, xzr, #0x1
+; CHECK-GI-NEXT:    cmn x16, x18
+; CHECK-GI-NEXT:    lsr x14, x14, #4
+; CHECK-GI-NEXT:    umulh x10, x5, x10
+; CHECK-GI-NEXT:    add x18, x6, x19
+; CHECK-GI-NEXT:    cset w6, hs
+; CHECK-GI-NEXT:    and x19, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x16, x4, x9
+; CHECK-GI-NEXT:    add x15, x18, x15
+; CHECK-GI-NEXT:    sub x18, x4, x4
+; CHECK-GI-NEXT:    and x6, x6, #0x1
+; CHECK-GI-NEXT:    add x18, x20, x18
+; CHECK-GI-NEXT:    add x7, x7, x19
+; CHECK-GI-NEXT:    umulh x9, x5, x9
+; CHECK-GI-NEXT:    add x17, x17, x6
+; CHECK-GI-NEXT:    lsl x20, x15, #60
+; CHECK-GI-NEXT:    adds x10, x18, x10
+; CHECK-GI-NEXT:    umulh x6, x4, xzr
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    orr x13, x20, x13, lsr #4
+; CHECK-GI-NEXT:    adds x10, x10, x16
+; CHECK-GI-NEXT:    and x18, x18, #0x1
+; CHECK-GI-NEXT:    umulh x16, x11, x12
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    adds x10, x10, x17
+; CHECK-GI-NEXT:    add x18, x7, x18
+; CHECK-GI-NEXT:    and x17, x19, #0x1
+; CHECK-GI-NEXT:    cset w7, hs
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x6
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    madd x14, x14, x12, x16
+; CHECK-GI-NEXT:    add x16, x18, x17
+; CHECK-GI-NEXT:    and x17, x7, #0x1
+; CHECK-GI-NEXT:    add x9, x16, x17
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    lsr x9, x15, #4
+; CHECK-GI-NEXT:    umulh x16, x13, x12
+; CHECK-GI-NEXT:    lsl x15, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    subs x0, x0, x11
+; CHECK-GI-NEXT:    mul x13, x13, x12
+; CHECK-GI-NEXT:    orr x10, x15, x10, lsr #4
+; CHECK-GI-NEXT:    sbc x1, x1, x14
+; CHECK-GI-NEXT:    madd x9, x9, x12, x16
+; CHECK-GI-NEXT:    umulh x15, x10, x12
+; CHECK-GI-NEXT:    subs x2, x2, x13
+; CHECK-GI-NEXT:    mul x10, x10, x12
+; CHECK-GI-NEXT:    sbc x3, x3, x9
+; CHECK-GI-NEXT:    madd x8, x8, x12, x15
+; CHECK-GI-NEXT:    subs x4, x4, x10
+; CHECK-GI-NEXT:    sbc x5, x5, x8
+; CHECK-GI-NEXT:    ldr x21, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i128> %d, <i128 100, i128 100, i128 100>
+  ret <3 x i128> %s
+}
+
+define <4 x i128> @uv4i128_7(<4 x i128> %d, <4 x i128> %e) {
+; CHECK-SD-LABEL: uv4i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w30, -80
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x25, x0
+; CHECK-SD-NEXT:    mov x26, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x25
+; CHECK-SD-NEXT:    mov x1, x26
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    mov x10, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x8, #9362 // =0x2492
+; CHECK-GI-NEXT:    sub x24, x0, x0
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    umulh x17, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x12, x1, x10
+; CHECK-GI-NEXT:    mul x13, x0, x8
+; CHECK-GI-NEXT:    umulh x18, x0, x10
+; CHECK-GI-NEXT:    mul x21, x1, x8
+; CHECK-GI-NEXT:    adds x12, x12, x13
+; CHECK-GI-NEXT:    umulh x14, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    cmn x12, x18
+; CHECK-GI-NEXT:    and x18, x13, #0x1
+; CHECK-GI-NEXT:    umulh x11, x0, x8
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x21, x21, x24
+; CHECK-GI-NEXT:    and x25, x13, #0x1
+; CHECK-GI-NEXT:    and x24, xzr, #0x1
+; CHECK-GI-NEXT:    mul x22, x3, x10
+; CHECK-GI-NEXT:    add x18, x18, x25
+; CHECK-GI-NEXT:    and x25, xzr, #0x1
+; CHECK-GI-NEXT:    adds x21, x21, x14
+; CHECK-GI-NEXT:    add x24, x24, x25
+; CHECK-GI-NEXT:    mul x23, x2, x8
+; CHECK-GI-NEXT:    cset w26, hs
+; CHECK-GI-NEXT:    adds x11, x21, x11
+; CHECK-GI-NEXT:    and x25, x26, #0x1
+; CHECK-GI-NEXT:    and x26, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x20, x2, x10
+; CHECK-GI-NEXT:    cset w21, hs
+; CHECK-GI-NEXT:    adds x11, x11, x18
+; CHECK-GI-NEXT:    add x24, x24, x25
+; CHECK-GI-NEXT:    and x21, x21, #0x1
+; CHECK-GI-NEXT:    cset w25, hs
+; CHECK-GI-NEXT:    umulh x9, xzr, x10
+; CHECK-GI-NEXT:    add x24, x24, x21
+; CHECK-GI-NEXT:    and x25, x25, #0x1
+; CHECK-GI-NEXT:    adds x22, x22, x23
+; CHECK-GI-NEXT:    add x24, x24, x25
+; CHECK-GI-NEXT:    umulh x19, x1, x8
+; CHECK-GI-NEXT:    cset w25, hs
+; CHECK-GI-NEXT:    cmn x22, x20
+; CHECK-GI-NEXT:    sub x22, x2, x2
+; CHECK-GI-NEXT:    mul x15, x3, x8
+; CHECK-GI-NEXT:    cset w20, hs
+; CHECK-GI-NEXT:    and x20, x20, #0x1
+; CHECK-GI-NEXT:    umulh x16, x3, x10
+; CHECK-GI-NEXT:    add x23, x9, x19
+; CHECK-GI-NEXT:    umulh x12, x2, x8
+; CHECK-GI-NEXT:    add x17, x23, x17
+; CHECK-GI-NEXT:    and x23, x25, #0x1
+; CHECK-GI-NEXT:    add x15, x15, x22
+; CHECK-GI-NEXT:    and x22, xzr, #0x1
+; CHECK-GI-NEXT:    add x20, x23, x20
+; CHECK-GI-NEXT:    umulh x13, x3, x8
+; CHECK-GI-NEXT:    add x22, x26, x22
+; CHECK-GI-NEXT:    adds x15, x15, x16
+; CHECK-GI-NEXT:    umulh x14, x2, xzr
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    and x23, x23, #0x1
+; CHECK-GI-NEXT:    adds x12, x15, x12
+; CHECK-GI-NEXT:    mul x18, x5, x10
+; CHECK-GI-NEXT:    add x22, x22, x23
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    adds x12, x12, x20
+; CHECK-GI-NEXT:    and x20, x23, #0x1
+; CHECK-GI-NEXT:    add x13, x9, x13
+; CHECK-GI-NEXT:    mul x21, x4, x8
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    add x20, x22, x20
+; CHECK-GI-NEXT:    and x22, x23, #0x1
+; CHECK-GI-NEXT:    add x13, x13, x14
+; CHECK-GI-NEXT:    add x14, x17, x24
+; CHECK-GI-NEXT:    umulh x19, x4, x10
+; CHECK-GI-NEXT:    add x20, x20, x22
+; CHECK-GI-NEXT:    add x13, x13, x20
+; CHECK-GI-NEXT:    sub x20, x4, x4
+; CHECK-GI-NEXT:    mul x25, x5, x8
+; CHECK-GI-NEXT:    adds x17, x18, x21
+; CHECK-GI-NEXT:    umulh x16, x5, x10
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    cmn x17, x19
+; CHECK-GI-NEXT:    and x18, x18, #0x1
+; CHECK-GI-NEXT:    umulh x15, x4, x8
+; CHECK-GI-NEXT:    cset w19, hs
+; CHECK-GI-NEXT:    add x20, x25, x20
+; CHECK-GI-NEXT:    and x19, x19, #0x1
+; CHECK-GI-NEXT:    mul x17, x7, x10
+; CHECK-GI-NEXT:    add x18, x18, x19
+; CHECK-GI-NEXT:    and x19, xzr, #0x1
+; CHECK-GI-NEXT:    adds x16, x20, x16
+; CHECK-GI-NEXT:    and x20, xzr, #0x1
+; CHECK-GI-NEXT:    mul x21, x6, x8
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    add x19, x19, x20
+; CHECK-GI-NEXT:    adds x15, x16, x15
+; CHECK-GI-NEXT:    and x20, x23, #0x1
+; CHECK-GI-NEXT:    and x23, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x24, x6, x10
+; CHECK-GI-NEXT:    cset w16, hs
+; CHECK-GI-NEXT:    adds x15, x15, x18
+; CHECK-GI-NEXT:    add x19, x19, x20
+; CHECK-GI-NEXT:    and x16, x16, #0x1
+; CHECK-GI-NEXT:    cset w20, hs
+; CHECK-GI-NEXT:    umulh x26, x5, x8
+; CHECK-GI-NEXT:    add x16, x19, x16
+; CHECK-GI-NEXT:    and x19, x20, #0x1
+; CHECK-GI-NEXT:    adds x17, x17, x21
+; CHECK-GI-NEXT:    add x16, x16, x19
+; CHECK-GI-NEXT:    umulh x22, x4, xzr
+; CHECK-GI-NEXT:    cset w20, hs
+; CHECK-GI-NEXT:    cmn x17, x24
+; CHECK-GI-NEXT:    and x20, x20, #0x1
+; CHECK-GI-NEXT:    mul x18, x7, x8
+; CHECK-GI-NEXT:    cset w21, hs
+; CHECK-GI-NEXT:    add x19, x9, x26
+; CHECK-GI-NEXT:    and x21, x21, #0x1
+; CHECK-GI-NEXT:    umulh x10, x7, x10
+; CHECK-GI-NEXT:    add x20, x20, x21
+; CHECK-GI-NEXT:    add x19, x19, x22
+; CHECK-GI-NEXT:    sub x22, x6, x6
+; CHECK-GI-NEXT:    umulh x17, x6, x8
+; CHECK-GI-NEXT:    add x16, x19, x16
+; CHECK-GI-NEXT:    add x18, x18, x22
+; CHECK-GI-NEXT:    and x22, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, x7, x8
+; CHECK-GI-NEXT:    add x21, x23, x22
+; CHECK-GI-NEXT:    adds x10, x18, x10
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x10, x10, x17
+; CHECK-GI-NEXT:    umulh x17, x6, xzr
+; CHECK-GI-NEXT:    and x18, x18, #0x1
+; CHECK-GI-NEXT:    cset w22, hs
+; CHECK-GI-NEXT:    adds x10, x10, x20
+; CHECK-GI-NEXT:    add x18, x21, x18
+; CHECK-GI-NEXT:    and x20, x22, #0x1
+; CHECK-GI-NEXT:    cset w21, hs
+; CHECK-GI-NEXT:    add x8, x9, x8
+; CHECK-GI-NEXT:    add x18, x18, x20
+; CHECK-GI-NEXT:    and x19, x21, #0x1
+; CHECK-GI-NEXT:    subs x9, x0, x11
+; CHECK-GI-NEXT:    add x18, x18, x19
+; CHECK-GI-NEXT:    add x8, x8, x17
+; CHECK-GI-NEXT:    sbc x17, x1, x14
+; CHECK-GI-NEXT:    subs x19, x2, x12
+; CHECK-GI-NEXT:    add x8, x8, x18
+; CHECK-GI-NEXT:    sbc x18, x3, x13
+; CHECK-GI-NEXT:    subs x20, x4, x15
+; CHECK-GI-NEXT:    lsl x23, x17, #63
+; CHECK-GI-NEXT:    sbc x21, x5, x16
+; CHECK-GI-NEXT:    subs x22, x6, x10
+; CHECK-GI-NEXT:    lsl x25, x18, #63
+; CHECK-GI-NEXT:    sbc x24, x7, x8
+; CHECK-GI-NEXT:    lsr x17, x17, #1
+; CHECK-GI-NEXT:    orr x9, x23, x9, lsr #1
+; CHECK-GI-NEXT:    lsl x23, x21, #63
+; CHECK-GI-NEXT:    lsr x18, x18, #1
+; CHECK-GI-NEXT:    orr x19, x25, x19, lsr #1
+; CHECK-GI-NEXT:    lsl x25, x24, #63
+; CHECK-GI-NEXT:    lsr x21, x21, #1
+; CHECK-GI-NEXT:    orr x20, x23, x20, lsr #1
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    orr x11, x25, x22, lsr #1
+; CHECK-GI-NEXT:    adc x14, x17, x14
+; CHECK-GI-NEXT:    adds x12, x19, x12
+; CHECK-GI-NEXT:    adc x13, x18, x13
+; CHECK-GI-NEXT:    adds x15, x20, x15
+; CHECK-GI-NEXT:    lsl x17, x14, #62
+; CHECK-GI-NEXT:    adc x16, x21, x16
+; CHECK-GI-NEXT:    adds x10, x11, x10
+; CHECK-GI-NEXT:    lsl x11, x13, #62
+; CHECK-GI-NEXT:    lsr x18, x24, #1
+; CHECK-GI-NEXT:    orr x9, x17, x9, lsr #2
+; CHECK-GI-NEXT:    mov w17, #7 // =0x7
+; CHECK-GI-NEXT:    lsr x14, x14, #2
+; CHECK-GI-NEXT:    orr x11, x11, x12, lsr #2
+; CHECK-GI-NEXT:    lsl x12, x16, #62
+; CHECK-GI-NEXT:    adc x8, x18, x8
+; CHECK-GI-NEXT:    umulh x18, x9, x17
+; CHECK-GI-NEXT:    lsr x13, x13, #2
+; CHECK-GI-NEXT:    lsl x19, x8, #62
+; CHECK-GI-NEXT:    orr x12, x12, x15, lsr #2
+; CHECK-GI-NEXT:    lsl x15, x14, #3
+; CHECK-GI-NEXT:    lsr x16, x16, #2
+; CHECK-GI-NEXT:    umulh x20, x11, x17
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    orr x10, x19, x10, lsr #2
+; CHECK-GI-NEXT:    lsl x19, x9, #3
+; CHECK-GI-NEXT:    sub x14, x15, x14
+; CHECK-GI-NEXT:    lsl x15, x13, #3
+; CHECK-GI-NEXT:    umulh x21, x12, x17
+; CHECK-GI-NEXT:    sub x9, x19, x9
+; CHECK-GI-NEXT:    add x14, x14, x18
+; CHECK-GI-NEXT:    lsl x18, x11, #3
+; CHECK-GI-NEXT:    sub x13, x15, x13
+; CHECK-GI-NEXT:    lsl x15, x12, #3
+; CHECK-GI-NEXT:    lsl x19, x16, #3
+; CHECK-GI-NEXT:    umulh x17, x10, x17
+; CHECK-GI-NEXT:    sub x11, x18, x11
+; CHECK-GI-NEXT:    lsl x18, x8, #3
+; CHECK-GI-NEXT:    sub x12, x15, x12
+; CHECK-GI-NEXT:    sub x15, x19, x16
+; CHECK-GI-NEXT:    lsl x16, x10, #3
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x13, x13, x20
+; CHECK-GI-NEXT:    add x15, x15, x21
+; CHECK-GI-NEXT:    sbc x1, x1, x14
+; CHECK-GI-NEXT:    subs x2, x2, x11
+; CHECK-GI-NEXT:    sub x9, x16, x10
+; CHECK-GI-NEXT:    sub x8, x18, x8
+; CHECK-GI-NEXT:    sbc x3, x3, x13
+; CHECK-GI-NEXT:    subs x4, x4, x12
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add x8, x8, x17
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sbc x5, x5, x15
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    subs x6, x6, x9
+; CHECK-GI-NEXT:    sbc x7, x7, x8
+; CHECK-GI-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i128> %d, <i128 7, i128 7, i128 7, i128 7>
+  ret <4 x i128> %s
+}
+
+define <4 x i128> @uv4i128_100(<4 x i128> %d, <4 x i128> %e) {
+; CHECK-SD-LABEL: uv4i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -40
+; CHECK-SD-NEXT:    .cfi_offset w24, -48
+; CHECK-SD-NEXT:    .cfi_offset w25, -56
+; CHECK-SD-NEXT:    .cfi_offset w26, -64
+; CHECK-SD-NEXT:    .cfi_offset w30, -80
+; CHECK-SD-NEXT:    mov x23, x3
+; CHECK-SD-NEXT:    mov x24, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    mov x19, x7
+; CHECK-SD-NEXT:    mov x20, x6
+; CHECK-SD-NEXT:    mov x21, x5
+; CHECK-SD-NEXT:    mov x22, x4
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x25, x0
+; CHECK-SD-NEXT:    mov x26, x1
+; CHECK-SD-NEXT:    mov x0, x24
+; CHECK-SD-NEXT:    mov x1, x23
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x23, x0
+; CHECK-SD-NEXT:    mov x24, x1
+; CHECK-SD-NEXT:    mov x0, x22
+; CHECK-SD-NEXT:    mov x1, x21
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    mov x6, x0
+; CHECK-SD-NEXT:    mov x7, x1
+; CHECK-SD-NEXT:    mov x0, x25
+; CHECK-SD-NEXT:    mov x1, x26
+; CHECK-SD-NEXT:    mov x2, x23
+; CHECK-SD-NEXT:    mov x3, x24
+; CHECK-SD-NEXT:    mov x4, x21
+; CHECK-SD-NEXT:    mov x5, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #80 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x29, [sp, #-96]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x9, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    sub x27, x0, x0
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x9, #23592, lsl #16
+; CHECK-GI-NEXT:    and x28, xzr, #0x1
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x9, #49807, lsl #32
+; CHECK-GI-NEXT:    umulh x13, x0, xzr
+; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x9, #10485, lsl #48
+; CHECK-GI-NEXT:    and x29, xzr, #0x1
+; CHECK-GI-NEXT:    mul x15, x1, x10
+; CHECK-GI-NEXT:    mul x18, x0, x9
+; CHECK-GI-NEXT:    umulh x17, x0, x10
+; CHECK-GI-NEXT:    mul x19, x1, x9
+; CHECK-GI-NEXT:    adds x18, x15, x18
+; CHECK-GI-NEXT:    umulh x20, x1, x10
+; CHECK-GI-NEXT:    cset w26, hs
+; CHECK-GI-NEXT:    cmn x18, x17
+; CHECK-GI-NEXT:    and x26, x26, #0x1
+; CHECK-GI-NEXT:    umulh x11, x0, x9
+; CHECK-GI-NEXT:    cset w17, hs
+; CHECK-GI-NEXT:    add x19, x19, x27
+; CHECK-GI-NEXT:    and x17, x17, #0x1
+; CHECK-GI-NEXT:    and x27, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x10
+; CHECK-GI-NEXT:    add x17, x26, x17
+; CHECK-GI-NEXT:    add x26, x28, x27
+; CHECK-GI-NEXT:    adds x20, x19, x20
+; CHECK-GI-NEXT:    umulh x21, x1, x9
+; CHECK-GI-NEXT:    cset w27, hs
+; CHECK-GI-NEXT:    and x27, x27, #0x1
+; CHECK-GI-NEXT:    adds x11, x20, x11
+; CHECK-GI-NEXT:    mul x24, x3, x10
+; CHECK-GI-NEXT:    add x27, x26, x27
+; CHECK-GI-NEXT:    cset w26, hs
+; CHECK-GI-NEXT:    adds x11, x11, x17
+; CHECK-GI-NEXT:    and x17, x26, #0x1
+; CHECK-GI-NEXT:    mul x25, x2, x9
+; CHECK-GI-NEXT:    cset w28, hs
+; CHECK-GI-NEXT:    add x17, x27, x17
+; CHECK-GI-NEXT:    and x27, x28, #0x1
+; CHECK-GI-NEXT:    add x21, x8, x21
+; CHECK-GI-NEXT:    umulh x16, x2, x10
+; CHECK-GI-NEXT:    add x17, x17, x27
+; CHECK-GI-NEXT:    add x27, x21, x13
+; CHECK-GI-NEXT:    sub x13, x2, x2
+; CHECK-GI-NEXT:    add x17, x27, x17
+; CHECK-GI-NEXT:    mul x22, x3, x9
+; CHECK-GI-NEXT:    adds x24, x24, x25
+; CHECK-GI-NEXT:    umulh x23, x3, x10
+; CHECK-GI-NEXT:    cset w28, hs
+; CHECK-GI-NEXT:    cmn x24, x16
+; CHECK-GI-NEXT:    and x21, x28, #0x1
+; CHECK-GI-NEXT:    umulh x12, x2, x9
+; CHECK-GI-NEXT:    cset w28, hs
+; CHECK-GI-NEXT:    add x13, x22, x13
+; CHECK-GI-NEXT:    and x22, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x14, x3, x9
+; CHECK-GI-NEXT:    add x22, x29, x22
+; CHECK-GI-NEXT:    adds x13, x13, x23
+; CHECK-GI-NEXT:    and x23, x28, #0x1
+; CHECK-GI-NEXT:    umulh x15, x2, xzr
+; CHECK-GI-NEXT:    cset w28, hs
+; CHECK-GI-NEXT:    add x21, x21, x23
+; CHECK-GI-NEXT:    adds x12, x13, x12
+; CHECK-GI-NEXT:    and x28, x28, #0x1
+; CHECK-GI-NEXT:    mul x18, x5, x10
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    add x22, x22, x28
+; CHECK-GI-NEXT:    and x23, x23, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x21
+; CHECK-GI-NEXT:    add x14, x8, x14
+; CHECK-GI-NEXT:    mul x19, x4, x9
+; CHECK-GI-NEXT:    add x22, x22, x23
+; CHECK-GI-NEXT:    cset w23, hs
+; CHECK-GI-NEXT:    add x15, x14, x15
+; CHECK-GI-NEXT:    and x27, x23, #0x1
+; CHECK-GI-NEXT:    umulh x20, x4, x10
+; CHECK-GI-NEXT:    add x27, x22, x27
+; CHECK-GI-NEXT:    add x15, x15, x27
+; CHECK-GI-NEXT:    mul x26, x5, x9
+; CHECK-GI-NEXT:    adds x14, x18, x19
+; CHECK-GI-NEXT:    sub x19, x4, x4
+; CHECK-GI-NEXT:    umulh x25, x5, x10
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    cmn x14, x20
+; CHECK-GI-NEXT:    and x28, x18, #0x1
+; CHECK-GI-NEXT:    umulh x24, x4, x9
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    add x14, x26, x19
+; CHECK-GI-NEXT:    and x20, x18, #0x1
+; CHECK-GI-NEXT:    mul x21, x7, x10
+; CHECK-GI-NEXT:    add x20, x28, x20
+; CHECK-GI-NEXT:    adds x19, x14, x25
+; CHECK-GI-NEXT:    and x25, xzr, #0x1
+; CHECK-GI-NEXT:    mul x23, x6, x9
+; CHECK-GI-NEXT:    cset w26, hs
+; CHECK-GI-NEXT:    adds x24, x19, x24
+; CHECK-GI-NEXT:    and x19, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x22, x6, x10
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    umulh x18, x7, x10
+; CHECK-GI-NEXT:    add x10, x25, x19
+; CHECK-GI-NEXT:    and x19, x26, #0x1
+; CHECK-GI-NEXT:    cset w25, hs
+; CHECK-GI-NEXT:    add x10, x10, x19
+; CHECK-GI-NEXT:    umulh x16, x5, x9
+; CHECK-GI-NEXT:    and x25, x25, #0x1
+; CHECK-GI-NEXT:    add x25, x10, x25
+; CHECK-GI-NEXT:    adds x10, x24, x20
+; CHECK-GI-NEXT:    lsl x24, x17, #60
+; CHECK-GI-NEXT:    umulh x13, x4, xzr
+; CHECK-GI-NEXT:    cset w26, hs
+; CHECK-GI-NEXT:    adds x21, x21, x23
+; CHECK-GI-NEXT:    and x23, x26, #0x1
+; CHECK-GI-NEXT:    orr x11, x24, x11, lsr #4
+; CHECK-GI-NEXT:    lsr x17, x17, #4
+; CHECK-GI-NEXT:    mul x14, x7, x9
+; CHECK-GI-NEXT:    add x23, x25, x23
+; CHECK-GI-NEXT:    cset w25, hs
+; CHECK-GI-NEXT:    cmn x21, x22
+; CHECK-GI-NEXT:    add x16, x8, x16
+; CHECK-GI-NEXT:    umulh x19, x6, x9
+; CHECK-GI-NEXT:    cset w21, hs
+; CHECK-GI-NEXT:    add x13, x16, x13
+; CHECK-GI-NEXT:    and x16, x25, #0x1
+; CHECK-GI-NEXT:    and x21, x21, #0x1
+; CHECK-GI-NEXT:    umulh x20, x7, x9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    add x16, x16, x21
+; CHECK-GI-NEXT:    sub x21, x6, x6
+; CHECK-GI-NEXT:    lsl x25, x15, #60
+; CHECK-GI-NEXT:    add x13, x13, x23
+; CHECK-GI-NEXT:    umulh x22, x11, x9
+; CHECK-GI-NEXT:    add x14, x14, x21
+; CHECK-GI-NEXT:    and x21, xzr, #0x1
+; CHECK-GI-NEXT:    adds x14, x14, x18
+; CHECK-GI-NEXT:    lsl x23, x13, #60
+; CHECK-GI-NEXT:    orr x12, x25, x12, lsr #4
+; CHECK-GI-NEXT:    umulh x24, x6, xzr
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x14, x14, x19
+; CHECK-GI-NEXT:    and x18, x18, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x20
+; CHECK-GI-NEXT:    orr x10, x23, x10, lsr #4
+; CHECK-GI-NEXT:    madd x17, x17, x9, x22
+; CHECK-GI-NEXT:    and x22, xzr, #0x1
+; CHECK-GI-NEXT:    lsr x15, x15, #4
+; CHECK-GI-NEXT:    add x21, x21, x22
+; CHECK-GI-NEXT:    cset w22, hs
+; CHECK-GI-NEXT:    adds x14, x14, x16
+; CHECK-GI-NEXT:    add x18, x21, x18
+; CHECK-GI-NEXT:    and x16, x22, #0x1
+; CHECK-GI-NEXT:    cset w21, hs
+; CHECK-GI-NEXT:    add x16, x18, x16
+; CHECK-GI-NEXT:    and x18, x21, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x24
+; CHECK-GI-NEXT:    add x16, x16, x18
+; CHECK-GI-NEXT:    umulh x19, x12, x9
+; CHECK-GI-NEXT:    lsr x13, x13, #4
+; CHECK-GI-NEXT:    add x8, x8, x16
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    lsl x16, x8, #60
+; CHECK-GI-NEXT:    mul x11, x11, x9
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    orr x14, x16, x14, lsr #4
+; CHECK-GI-NEXT:    umulh x18, x10, x9
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mul x12, x12, x9
+; CHECK-GI-NEXT:    subs x0, x0, x11
+; CHECK-GI-NEXT:    umulh x16, x14, x9
+; CHECK-GI-NEXT:    sbc x1, x1, x17
+; CHECK-GI-NEXT:    madd x15, x15, x9, x19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    subs x2, x2, x12
+; CHECK-GI-NEXT:    mul x10, x10, x9
+; CHECK-GI-NEXT:    madd x13, x13, x9, x18
+; CHECK-GI-NEXT:    sbc x3, x3, x15
+; CHECK-GI-NEXT:    mul x14, x14, x9
+; CHECK-GI-NEXT:    subs x4, x4, x10
+; CHECK-GI-NEXT:    madd x8, x8, x9, x16
+; CHECK-GI-NEXT:    sbc x5, x5, x13
+; CHECK-GI-NEXT:    subs x6, x6, x14
+; CHECK-GI-NEXT:    sbc x7, x7, x8
+; CHECK-GI-NEXT:    ldr x29, [sp], #96 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i128> %d, <i128 100, i128 100, i128 100, i128 100>
+  ret <4 x i128> %s
+}



More information about the llvm-commits mailing list