[llvm] [GlobalISel] Allow expansion of urem by constant in prelegalizer (PR #145914)

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 1 05:04:23 PDT 2025


================
@@ -0,0 +1,7966 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i8 @si8_7(i8 %a, i8 %b) { ; i8 signed remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: si8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #-109 // =0xffffff93
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    add w8, w0, w8, lsr #8
+; CHECK-SD-NEXT:    sbfx w9, w8, #2, #6
+; CHECK-SD-NEXT:    and w8, w8, #0x80
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #7
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 7 ; the -global-isel=0 expectations use a multiply/shift sequence with no divide; the -global-isel=1 expectations still contain sdiv
+  ret i8 %s
+}
+
+define i8 @si8_100(i8 %a, i8 %b) { ; i8 signed remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: si8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxtb w8, w0
+; CHECK-SD-NEXT:    mov w9, #41 // =0x29
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #12
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i8 %a, 100 ; the -global-isel=0 expectations use mul/asr/msub with no divide; the -global-isel=1 expectations use sdiv followed by msub
+  ret i8 %s
+}
+
+define i8 @ui8_7(i8 %a, i8 %b) { ; i8 unsigned remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: ui8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #8
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 7 ; both llc invocations are expected to multiply by #37 and shift, with no divide instruction
+  ret i8 %s
+}
+
+define i8 @ui8_100(i8 %a, i8 %b) { ; i8 unsigned remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: ui8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #41 // =0x29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #12
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #8
+; CHECK-GI-NEXT:    lsr w8, w8, #4
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i8 %a, 100 ; both llc invocations are expected to use mul/lsr/msub with no divide; the -global-isel=1 output has two shifts (#8 then #4) where the other has one (#12)
+  ret i8 %s
+}
+
+define i16 @si16_7(i16 %a, i16 %b) { ; i16 signed remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: si16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #17
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 7 ; the -global-isel=0 expectations use a multiply/shift sequence with no divide; the -global-isel=1 expectations still contain sdiv
+  ret i16 %s
+}
+
+define i16 @si16_100(i16 %a, i16 %b) { ; i16 signed remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: si16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sxth w8, w0
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    asr w9, w8, #19
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i16 %a, 100 ; the -global-isel=0 expectations use mul/asr/msub with no divide; the -global-isel=1 expectations use sdiv followed by msub
+  ret i16 %s
+}
+
+define i16 @ui16_7(i16 %a, i16 %b) { ; i16 unsigned remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: ui16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    and w9, w0, #0xffff
+; CHECK-SD-NEXT:    mul w8, w9, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #16
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    and w9, w9, #0xfffe
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    and w9, w0, #0xffff
+; CHECK-GI-NEXT:    mul w8, w9, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    ubfx w8, w8, #2, #14
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 7 ; both llc invocations are expected to multiply by #9363 and shift, with no divide instruction
+  ret i16 %s
+}
+
+define i16 @ui16_100(i16 %a, i16 %b) { ; i16 unsigned remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: ui16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-SD-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-SD-NEXT:    mul w8, w8, w9
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    lsr w8, w8, #17
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ubfx w8, w0, #2, #14
+; CHECK-GI-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    lsr w8, w8, #1
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i16 %a, 100 ; both llc invocations are expected to use ubfx/mul/lsr/msub with no divide; the -global-isel=1 output has two shifts (#16 then #1) where the other has one (#17)
+  ret i16 %s
+}
+
+define i32 @si32_7(i32 %a, i32 %b) { ; i32 signed remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: si32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w9, w8, #2
+; CHECK-SD-NEXT:    add w8, w9, w8, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w8, w0, w8
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 7 ; the -global-isel=0 expectations use smull plus shifts with no divide; the -global-isel=1 expectations still contain sdiv
+  ret i32 %s
+}
+
+define i32 @si32_100(i32 %a, i32 %b) { ; i32 signed remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: si32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w9, w0, w8
+; CHECK-GI-NEXT:    msub w0, w9, w8, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i32 %a, 100 ; the -global-isel=0 expectations use smull/asr/msub with no divide; the -global-isel=1 expectations use sdiv followed by msub
+  ret i32 %s
+}
+
+define i32 @ui32_7(i32 %a, i32 %b) { ; i32 unsigned remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: ui32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w9, w0, w8
+; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk w8, #9362, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    lsr w8, w8, #2
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w0, w0, w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 7 ; both llc invocations are expected to use umull plus shifts with no divide; they differ only in the final multiply-by-7 and subtract
+  ret i32 %s
+}
+
+define i32 @ui32_100(i32 %a, i32 %b) { ; i32 unsigned remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: ui32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    umull x8, w0, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    lsr w8, w8, #5
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i32 %a, 100 ; both llc invocations are expected to use umull/lsr/msub with no divide; the -global-isel=1 output has two shifts (#32 then #5) where the other has one (#37)
+  ret i32 %s
+}
+
+define i64 @si64_7(i64 %a, i64 %b) { ; i64 signed remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: si64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    asr x9, x8, #1
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv x8, x0, x8
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 7 ; the -global-isel=0 expectations use smulh plus shifts with no divide; the -global-isel=1 expectations still contain sdiv
+  ret i64 %s
+}
+
+define i64 @si64_100(i64 %a, i64 %b) { ; i64 signed remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: si64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x8, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x0
+; CHECK-SD-NEXT:    asr x9, x8, #6
+; CHECK-SD-NEXT:    add x8, x9, x8, lsr #63
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub x0, x8, x9, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: si64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv x9, x0, x8
+; CHECK-GI-NEXT:    msub x0, x9, x8, x0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem i64 %a, 100 ; the -global-isel=0 expectations use smulh/asr/msub with no divide; the -global-isel=1 expectations use sdiv followed by msub
+  ret i64 %s
+}
+
+define i64 @ui64_7(i64 %a, i64 %b) { ; i64 unsigned remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: ui64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x0, x8
+; CHECK-SD-NEXT:    sub x9, x0, x8
+; CHECK-SD-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x0, x0, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x8, x0, x8
+; CHECK-GI-NEXT:    sub x9, x0, x8
+; CHECK-GI-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x0, x0, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i64 %a, 7 ; both llc invocations are expected to use umulh plus shifts with no divide; they differ only in the final multiply-by-7 and subtract
+  ret i64 %s
+}
+
+define i64 @ui64_100(i64 %a, i64 %b) { ; i64 unsigned remainder of %a by the constant 100; %b is unused
+; CHECK-LABEL: ui64_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x9, #62915 // =0xf5c3
+; CHECK-NEXT:    lsr x8, x0, #2
+; CHECK-NEXT:    movk x9, #23592, lsl #16
+; CHECK-NEXT:    movk x9, #49807, lsl #32
+; CHECK-NEXT:    movk x9, #10485, lsl #48
+; CHECK-NEXT:    umulh x8, x8, x9
+; CHECK-NEXT:    mov w9, #100 // =0x64
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    msub x0, x8, x9, x0
+; CHECK-NEXT:    ret
+entry:
+  %s = urem i64 %a, 100 ; a single set of expectations covers both llc invocations: lsr/umulh/lsr/msub with no divide
+  ret i64 %s
+}
+
+define i128 @si128_7(i128 %a, i128 %b) { ; i128 signed remainder of %a by the constant 7; %b is unused
+; CHECK-LABEL: si128_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #7 // =0x7
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 7 ; a single set of expectations covers both llc invocations: the divisor is placed in x2/x3 and __modti3 is called
+  ret i128 %s
+}
+
+define i128 @si128_100(i128 %a, i128 %b) { ; i128 signed remainder of %a by the constant 100; %b is unused
+; CHECK-LABEL: si128_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w2, #100 // =0x64
+; CHECK-NEXT:    mov x3, xzr
+; CHECK-NEXT:    bl __modti3
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %s = srem i128 %a, 100 ; a single set of expectations covers both llc invocations: the divisor is placed in x2/x3 and __modti3 is called
+  ret i128 %s
+}
+
+define i128 @ui128_7(i128 %a, i128 %b) { ; i128 unsigned remainder of %a by the constant 7; %b is unused
+; CHECK-SD-LABEL: ui128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT:    mov x10, #9362 // =0x2492
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x10, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x10, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #48
+; CHECK-GI-NEXT:    movk x10, #9362, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    subs x10, x0, x9
+; CHECK-GI-NEXT:    sbc x11, x1, x8
+; CHECK-GI-NEXT:    lsl x12, x11, #63
+; CHECK-GI-NEXT:    lsr x11, x11, #1
+; CHECK-GI-NEXT:    orr x10, x12, x10, lsr #1
+; CHECK-GI-NEXT:    adds x9, x10, x9
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsl x10, x8, #62
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #2
+; CHECK-GI-NEXT:    mov w10, #7 // =0x7
+; CHECK-GI-NEXT:    lsl x12, x8, #3
+; CHECK-GI-NEXT:    umulh x10, x9, x10
+; CHECK-GI-NEXT:    lsl x11, x9, #3
+; CHECK-GI-NEXT:    sub x8, x12, x8
+; CHECK-GI-NEXT:    sub x9, x11, x9
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 7 ; the -global-isel=0 expectations call __umodti3; the -global-isel=1 expectations are an inline mul/umulh/shift sequence with no call and no divide
+  ret i128 %s
+}
+
+define i128 @ui128_100(i128 %a, i128 %b) { ; i128 unsigned remainder of %a by the constant 100; %b is unused
+; CHECK-SD-LABEL: ui128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset w30, -16
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __umodti3
+; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ui128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #23593 // =0x5c29
+; CHECK-GI-NEXT:    mov x10, #62914 // =0xf5c2
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #16
+; CHECK-GI-NEXT:    movk x10, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #32
+; CHECK-GI-NEXT:    movk x10, #49807, lsl #32
+; CHECK-GI-NEXT:    movk x8, #36700, lsl #48
+; CHECK-GI-NEXT:    movk x10, #10485, lsl #48
+; CHECK-GI-NEXT:    mul x9, x1, x8
+; CHECK-GI-NEXT:    mul x11, x0, x10
+; CHECK-GI-NEXT:    umulh x12, x0, x8
+; CHECK-GI-NEXT:    mul x13, x1, x10
+; CHECK-GI-NEXT:    adds x9, x9, x11
+; CHECK-GI-NEXT:    umulh x14, x1, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    cmn x9, x12
+; CHECK-GI-NEXT:    and x9, x11, #0x1
+; CHECK-GI-NEXT:    sub x12, x0, x0
+; CHECK-GI-NEXT:    umulh x15, x0, x10
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    add x12, x13, x12
+; CHECK-GI-NEXT:    and x13, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x8, xzr, x8
+; CHECK-GI-NEXT:    add x9, x9, x11
+; CHECK-GI-NEXT:    and x11, xzr, #0x1
+; CHECK-GI-NEXT:    adds x12, x12, x14
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    umulh x10, x1, x10
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    adds x12, x12, x15
+; CHECK-GI-NEXT:    and x13, x13, #0x1
+; CHECK-GI-NEXT:    umulh x14, x0, xzr
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    adds x9, x12, x9
+; CHECK-GI-NEXT:    add x11, x11, x13
+; CHECK-GI-NEXT:    and x12, x15, #0x1
+; CHECK-GI-NEXT:    cset w13, hs
+; CHECK-GI-NEXT:    add x11, x11, x12
+; CHECK-GI-NEXT:    and x12, x13, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    add x10, x11, x12
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x10
+; CHECK-GI-NEXT:    lsl x10, x8, #60
+; CHECK-GI-NEXT:    lsr x8, x8, #4
+; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #4
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    umulh x11, x9, x10
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    madd x8, x8, x10, x11
+; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    sbc x1, x1, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem i128 %a, 100 ; the -global-isel=0 expectations call __umodti3; the -global-isel=1 expectations are an inline mul/umulh/shift sequence with no call and no divide
+  ret i128 %s
+}
+
+define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) { ; <2 x i8> signed remainder of %d by a splat of 7; %e is unused
+; CHECK-SD-LABEL: sv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #24
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #24
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 7, i8 7> ; the -global-isel=0 expectations use a vector smull/shift sequence with no divide; the -global-isel=1 expectations contain one scalar sdiv per lane
+  ret <2 x i8> %s
+}
+
+define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) { ; <2 x i8> signed remainder of %d by a splat of 100; %e is unused
+; CHECK-SD-LABEL: sv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    smov w11, v1.h[1]
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i8> %d, <i8 100, i8 100> ; the -global-isel=0 expectations use a vector smull/shift sequence with no divide; the -global-isel=1 expectations contain one scalar sdiv per lane
+  ret <2 x i8> %s
+}
+
+define <3 x i8> @sv3i8_7(<3 x i8> %d, <3 x i8> %e) { ; <3 x i8> signed remainder of %d by a splat of 7; %e is unused; this function has no named entry block
+; CHECK-SD-LABEL: sv3i8_7:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov x9, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    movk x9, #37449, lsl #16
+; CHECK-SD-NEXT:    sxtb w12, w1
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    sxtb w13, w0
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    sxtb w11, w2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    add w8, w8, w13
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    asr w16, w9, #2
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w16, w9, lsr #31
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w0, w13, w8
+; CHECK-SD-NEXT:    add w1, w12, w10
+; CHECK-SD-NEXT:    add w2, w11, w9
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_7:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w0, w8, w10
+; CHECK-GI-NEXT:    sdiv w9, w13, w9
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w12, w15, w12
+; CHECK-GI-NEXT:    sub w1, w11, w12
+; CHECK-GI-NEXT:    lsl w16, w9, #3
+; CHECK-GI-NEXT:    sub w9, w16, w9
+; CHECK-GI-NEXT:    sub w2, w13, w9
+; CHECK-GI-NEXT:    ret
+  %s = srem <3 x i8> %d, <i8 7, i8 7, i8 7> ; the -global-isel=0 expectations use one scalar smull per element with no divide; the -global-isel=1 expectations contain one sdiv per element
+  ret <3 x i8> %s
+}
+
+define <3 x i8> @sv3i8_100(<3 x i8> %d, <3 x i8> %e) { ; <3 x i8> signed remainder of %d by a splat of 100; %e is unused; this function has no named entry block
+; CHECK-SD-LABEL: sv3i8_100:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    sxtb x8, w0
+; CHECK-SD-NEXT:    mov w9, #34079 // =0x851f
+; CHECK-SD-NEXT:    // kill: def $w2 killed $w2 def $x2
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT:    sxtb x10, w1
+; CHECK-SD-NEXT:    movk w9, #20971, lsl #16
+; CHECK-SD-NEXT:    sxtb x11, w2
+; CHECK-SD-NEXT:    sxtb w12, w0
+; CHECK-SD-NEXT:    smull x8, w8, w9
+; CHECK-SD-NEXT:    smull x10, w10, w9
+; CHECK-SD-NEXT:    smull x9, w11, w9
+; CHECK-SD-NEXT:    mov w11, #100 // =0x64
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    msub w0, w8, w11, w12
+; CHECK-SD-NEXT:    sxtb w8, w1
+; CHECK-SD-NEXT:    msub w1, w10, w11, w8
+; CHECK-SD-NEXT:    sxtb w8, w2
+; CHECK-SD-NEXT:    msub w2, w9, w11, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i8_100:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w11, w1
+; CHECK-GI-NEXT:    sxtb w13, w2
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w12, w11, w9
+; CHECK-GI-NEXT:    msub w0, w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w14, w13, w9
+; CHECK-GI-NEXT:    msub w1, w12, w9, w11
+; CHECK-GI-NEXT:    msub w2, w14, w9, w13
+; CHECK-GI-NEXT:    ret
+  %s = srem <3 x i8> %d, <i8 100, i8 100, i8 100> ; the -global-isel=0 expectations use one scalar smull per element with no divide; the -global-isel=1 expectations contain one sdiv per element
+  ret <3 x i8> %s
+}
+
+define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; <4 x i8> signed remainder of %d by a splat of 7; %e is unused
+; CHECK-SD-LABEL: sv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    smov x12, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smov x14, v0.h[3]
+; CHECK-SD-NEXT:    smov w16, v0.h[2]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    smull x8, w14, w8
+; CHECK-SD-NEXT:    smov w14, v0.h[3]
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    asr w15, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w17, w10, #2
+; CHECK-SD-NEXT:    add w12, w12, w16
+; CHECK-SD-NEXT:    add w9, w15, w9, lsr #31
+; CHECK-SD-NEXT:    asr w15, w12, #2
+; CHECK-SD-NEXT:    add w8, w8, w14
+; CHECK-SD-NEXT:    add w10, w17, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w11, w9
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w9, w15, w12, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w9, w16, w9
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v3.4h, #7
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7> ; the -global-isel=0 expectations use one scalar smull per lane with no divide; the -global-isel=1 expectations contain one scalar sdiv per lane
+  ret <4 x i8> %s
+}
+
+define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; <4 x i8> signed remainder of %d by a splat of 100; %e is unused
+; CHECK-SD-LABEL: sv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    sshr v1.4h, v0.4h, #8
+; CHECK-SD-NEXT:    smov x9, v1.h[0]
+; CHECK-SD-NEXT:    smov x10, v1.h[1]
+; CHECK-SD-NEXT:    smov x11, v1.h[2]
+; CHECK-SD-NEXT:    smov w12, v1.h[0]
+; CHECK-SD-NEXT:    smov x13, v1.h[3]
+; CHECK-SD-NEXT:    smov w15, v1.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    smull x8, w13, w8
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    msub w10, w10, w14, w15
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w9, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w11, v1.h[2]
+; CHECK-SD-NEXT:    msub w9, w9, w14, w11
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    smov w10, v1.h[3]
+; CHECK-SD-NEXT:    msub w8, w8, w14, w10
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v3.4h, #100
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w9, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100> ; the -global-isel=0 expectations use one scalar smull per lane with no divide; the -global-isel=1 expectations contain one scalar sdiv per lane
+  ret <4 x i8> %s
+}
+
+; Signed remainder of a <8 x i8> vector by the splat constant 7; %e is unused.
+; The SelectionDAG assertions below stay fully vectorised (smull by a splat
+; constant, shrn, shifts, then mls), while the GlobalISel assertions widen
+; the lanes to 32 bits and issue one scalar sdiv per lane before two mls.
+; The assertions are autogenerated (see the NOTE at the top of the file);
+; regenerate them with the update script rather than editing by hand.
+define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #147
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    add v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+; Signed remainder of a <8 x i8> vector by the splat constant 100; %e is unused.
+; The SelectionDAG assertions below stay fully vectorised (smull by a splat
+; constant, shrn, shifts, then mls), while the GlobalISel assertions widen
+; the lanes to 32 bits and issue one scalar sdiv per lane before two mls.
+; The assertions are autogenerated (see the NOTE at the top of the file);
+; regenerate them with the update script rather than editing by hand.
+define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
+; CHECK-SD-LABEL: sv8i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #41
+; CHECK-SD-NEXT:    movi v2.8b, #100
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sshr v1.8b, v1.8b, #4
+; CHECK-SD-NEXT:    usra v1.8b, v1.8b, #7
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+; Signed remainder of a <16 x i8> vector by the splat constant 7; %e is unused.
+; The SelectionDAG assertions below stay fully vectorised (smull/smull2 by a
+; splat constant, uzp2, shifts, then mls), while the GlobalISel assertions
+; widen the lanes to 32 bits and issue one scalar sdiv per lane before
+; recombining with mls and uzp1.
+; The assertions are autogenerated (see the NOTE at the top of the file);
+; regenerate them with the update script rather than editing by hand.
+define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #147
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #7
+; CHECK-SD-NEXT:    add v1.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #2
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v16.8b, #7
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+; Signed remainder of a <16 x i8> vector by the splat constant 100; %e is unused.
+; The SelectionDAG assertions below stay fully vectorised (smull/smull2 by a
+; splat constant, uzp2, shifts, then mls), while the GlobalISel assertions
+; widen the lanes to 32 bits and issue one scalar sdiv per lane before
+; recombining with mls and uzp1.
+; The assertions are autogenerated (see the NOTE at the top of the file);
+; regenerate them with the update script rather than editing by hand.
+define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) {
+; CHECK-SD-LABEL: sv16i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.16b, #41
+; CHECK-SD-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    smull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    movi v2.16b, #100
+; CHECK-SD-NEXT:    sshr v1.16b, v1.16b, #4
+; CHECK-SD-NEXT:    usra v1.16b, v1.16b, #7
+; CHECK-SD-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v16.8b, #100
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    fmov w17, s0
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w18, v0.s[1]
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    mov w1, v0.s[3]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sshll v17.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[2]
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[3]
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    mls v0.4s, v6.4s, v17.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+; Signed remainder of a <32 x i8> vector by the splat constant 7; %e is unused.
+; The value arrives in two 128-bit registers (v0 and v1). In the SelectionDAG
+; assertions below every byte lane is extracted with smov and reduced with a
+; scalar multiply-by-constant sequence (smull, shifts, add), spilling x19-x23.
+; The GlobalISel assertions widen the lanes to 32 bits and issue one scalar
+; sdiv per lane before recombining with mls and uzp1; they spill x19-x30 and
+; one 4-byte stack slot.
+; The assertions are autogenerated (see the NOTE at the top of the file);
+; regenerate them with the update script rather than editing by hand.
+define <32 x i8> @sv32i8_7(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: sv32i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -48
+; CHECK-SD-NEXT:    smov x10, v0.b[0]
+; CHECK-SD-NEXT:    smov x9, v0.b[1]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov w17, v0.b[0]
+; CHECK-SD-NEXT:    smov w15, v0.b[1]
+; CHECK-SD-NEXT:    smov x11, v0.b[2]
+; CHECK-SD-NEXT:    smov x13, v0.b[3]
+; CHECK-SD-NEXT:    smov x18, v0.b[4]
+; CHECK-SD-NEXT:    smov w14, v0.b[2]
+; CHECK-SD-NEXT:    smov x1, v0.b[5]
+; CHECK-SD-NEXT:    smov w16, v0.b[3]
+; CHECK-SD-NEXT:    smull x2, w10, w8
+; CHECK-SD-NEXT:    smov x3, v0.b[6]
+; CHECK-SD-NEXT:    smov w12, v0.b[4]
+; CHECK-SD-NEXT:    smull x0, w9, w8
+; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    smov x5, v0.b[7]
+; CHECK-SD-NEXT:    smull x4, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.b[5]
+; CHECK-SD-NEXT:    smov w10, v0.b[6]
+; CHECK-SD-NEXT:    lsr x2, x2, #32
+; CHECK-SD-NEXT:    smull x13, w13, w8
+; CHECK-SD-NEXT:    smov w9, v0.b[7]
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov x6, v0.b[8]
+; CHECK-SD-NEXT:    add w2, w2, w17
+; CHECK-SD-NEXT:    lsr x4, x4, #32
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w0, w0, w15
+; CHECK-SD-NEXT:    asr w19, w2, #2
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    asr w7, w0, #2
+; CHECK-SD-NEXT:    add w4, w4, w14
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w2, w19, w2, lsr #31
+; CHECK-SD-NEXT:    smull x3, w3, w8
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    add w0, w7, w0, lsr #31
+; CHECK-SD-NEXT:    asr w7, w4, #2
+; CHECK-SD-NEXT:    add w13, w13, w16
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w18, w18, w12
+; CHECK-SD-NEXT:    asr w19, w13, #2
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w4, w7, w4, lsr #31
+; CHECK-SD-NEXT:    asr w20, w18, #2
+; CHECK-SD-NEXT:    add w17, w17, w2
+; CHECK-SD-NEXT:    smull x5, w5, w8
+; CHECK-SD-NEXT:    add w1, w1, w11
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    smov x0, v0.b[9]
+; CHECK-SD-NEXT:    fmov s0, w17
+; CHECK-SD-NEXT:    sub w4, w4, w4, lsl #3
+; CHECK-SD-NEXT:    add w7, w19, w13, lsr #31
+; CHECK-SD-NEXT:    add w13, w20, w18, lsr #31
+; CHECK-SD-NEXT:    lsr x18, x3, #32
+; CHECK-SD-NEXT:    asr w2, w1, #2
+; CHECK-SD-NEXT:    smull x3, w6, w8
+; CHECK-SD-NEXT:    mov v0.b[1], w15
+; CHECK-SD-NEXT:    smov x6, v2.b[10]
+; CHECK-SD-NEXT:    add w14, w14, w4
+; CHECK-SD-NEXT:    lsr x17, x5, #32
+; CHECK-SD-NEXT:    add w5, w18, w10
+; CHECK-SD-NEXT:    add w15, w2, w1, lsr #31
+; CHECK-SD-NEXT:    smull x1, w0, w8
+; CHECK-SD-NEXT:    sub w7, w7, w7, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    smov w18, v2.b[8]
+; CHECK-SD-NEXT:    smov w0, v2.b[9]
+; CHECK-SD-NEXT:    add w17, w17, w9
+; CHECK-SD-NEXT:    mov v0.b[2], w14
+; CHECK-SD-NEXT:    asr w14, w5, #2
+; CHECK-SD-NEXT:    add w12, w12, w13
+; CHECK-SD-NEXT:    lsr x19, x1, #32
+; CHECK-SD-NEXT:    lsr x3, x3, #32
+; CHECK-SD-NEXT:    smov x13, v1.b[0]
+; CHECK-SD-NEXT:    add w1, w14, w5, lsr #31
+; CHECK-SD-NEXT:    smull x5, w6, w8
+; CHECK-SD-NEXT:    add w6, w16, w7
+; CHECK-SD-NEXT:    smov x7, v1.b[1]
+; CHECK-SD-NEXT:    smov w14, v2.b[10]
+; CHECK-SD-NEXT:    asr w2, w17, #2
+; CHECK-SD-NEXT:    mov v0.b[3], w6
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w3, w3, w18
+; CHECK-SD-NEXT:    lsr x5, x5, #32
+; CHECK-SD-NEXT:    asr w4, w3, #2
+; CHECK-SD-NEXT:    add w16, w19, w0
+; CHECK-SD-NEXT:    smov x19, v2.b[11]
+; CHECK-SD-NEXT:    add w20, w2, w17, lsr #31
+; CHECK-SD-NEXT:    add w11, w11, w15
+; CHECK-SD-NEXT:    add w2, w5, w14
+; CHECK-SD-NEXT:    smov w5, v1.b[1]
+; CHECK-SD-NEXT:    smull x15, w13, w8
+; CHECK-SD-NEXT:    mov v0.b[4], w12
+; CHECK-SD-NEXT:    smull x12, w7, w8
+; CHECK-SD-NEXT:    add w21, w4, w3, lsr #31
+; CHECK-SD-NEXT:    sub w7, w1, w1, lsl #3
+; CHECK-SD-NEXT:    smov x22, v1.b[2]
+; CHECK-SD-NEXT:    asr w6, w16, #2
+; CHECK-SD-NEXT:    smull x4, w19, w8
+; CHECK-SD-NEXT:    sub w19, w20, w20, lsl #3
+; CHECK-SD-NEXT:    sub w20, w21, w21, lsl #3
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    lsr x21, x15, #32
+; CHECK-SD-NEXT:    add w15, w10, w7
+; CHECK-SD-NEXT:    mov v0.b[5], w11
+; CHECK-SD-NEXT:    smov w11, v1.b[0]
+; CHECK-SD-NEXT:    add w6, w6, w16, lsr #31
+; CHECK-SD-NEXT:    add w12, w12, w5
+; CHECK-SD-NEXT:    add w9, w9, w19
+; CHECK-SD-NEXT:    smull x19, w22, w8
+; CHECK-SD-NEXT:    asr w7, w12, #2
+; CHECK-SD-NEXT:    smov x22, v1.b[3]
+; CHECK-SD-NEXT:    sub w23, w6, w6, lsl #3
+; CHECK-SD-NEXT:    add w20, w18, w20
+; CHECK-SD-NEXT:    smov w6, v1.b[2]
+; CHECK-SD-NEXT:    smov w17, v2.b[11]
+; CHECK-SD-NEXT:    mov v0.b[6], w15
+; CHECK-SD-NEXT:    add w21, w21, w11
+; CHECK-SD-NEXT:    add w7, w7, w12, lsr #31
+; CHECK-SD-NEXT:    asr w12, w21, #2
+; CHECK-SD-NEXT:    lsr x19, x19, #32
+; CHECK-SD-NEXT:    smov x1, v2.b[12]
+; CHECK-SD-NEXT:    sub w7, w7, w7, lsl #3
+; CHECK-SD-NEXT:    smov w13, v2.b[12]
+; CHECK-SD-NEXT:    smov x16, v2.b[13]
+; CHECK-SD-NEXT:    add w18, w12, w21, lsr #31
+; CHECK-SD-NEXT:    smov w10, v2.b[13]
+; CHECK-SD-NEXT:    smov x15, v2.b[14]
+; CHECK-SD-NEXT:    mov v0.b[7], w9
+; CHECK-SD-NEXT:    add w5, w5, w7
+; CHECK-SD-NEXT:    add w7, w19, w6
+; CHECK-SD-NEXT:    sub w21, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w18, w0, w23
+; CHECK-SD-NEXT:    smull x0, w22, w8
+; CHECK-SD-NEXT:    smov x22, v1.b[4]
+; CHECK-SD-NEXT:    smov w19, v1.b[3]
+; CHECK-SD-NEXT:    smov w9, v2.b[14]
+; CHECK-SD-NEXT:    smov x12, v2.b[15]
+; CHECK-SD-NEXT:    asr w3, w2, #2
+; CHECK-SD-NEXT:    lsr x4, x4, #32
+; CHECK-SD-NEXT:    mov v0.b[8], w20
+; CHECK-SD-NEXT:    add w20, w11, w21
+; CHECK-SD-NEXT:    asr w21, w7, #2
+; CHECK-SD-NEXT:    smov w11, v2.b[15]
+; CHECK-SD-NEXT:    fmov s2, w20
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    add w7, w21, w7, lsr #31
+; CHECK-SD-NEXT:    smull x20, w22, w8
+; CHECK-SD-NEXT:    smov x21, v1.b[5]
+; CHECK-SD-NEXT:    add w0, w0, w19
+; CHECK-SD-NEXT:    add w2, w3, w2, lsr #31
+; CHECK-SD-NEXT:    smov x22, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[1], w5
+; CHECK-SD-NEXT:    smov w5, v1.b[4]
+; CHECK-SD-NEXT:    sub w3, w7, w7, lsl #3
+; CHECK-SD-NEXT:    asr w7, w0, #2
+; CHECK-SD-NEXT:    lsr x20, x20, #32
+; CHECK-SD-NEXT:    add w4, w4, w17
+; CHECK-SD-NEXT:    add w3, w6, w3
+; CHECK-SD-NEXT:    smull x6, w21, w8
+; CHECK-SD-NEXT:    asr w21, w4, #2
+; CHECK-SD-NEXT:    add w0, w7, w0, lsr #31
+; CHECK-SD-NEXT:    smov w7, v1.b[5]
+; CHECK-SD-NEXT:    smull x22, w22, w8
+; CHECK-SD-NEXT:    mov v2.b[2], w3
+; CHECK-SD-NEXT:    add w3, w20, w5
+; CHECK-SD-NEXT:    add w4, w21, w4, lsr #31
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w20, w3, #2
+; CHECK-SD-NEXT:    lsr x6, x6, #32
+; CHECK-SD-NEXT:    lsr x22, x22, #32
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    smov x21, v1.b[8]
+; CHECK-SD-NEXT:    add w19, w19, w0
+; CHECK-SD-NEXT:    add w3, w20, w3, lsr #31
+; CHECK-SD-NEXT:    smov w0, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[3], w19
+; CHECK-SD-NEXT:    smov x19, v1.b[7]
+; CHECK-SD-NEXT:    add w6, w6, w7
+; CHECK-SD-NEXT:    sub w3, w3, w3, lsl #3
+; CHECK-SD-NEXT:    asr w20, w6, #2
+; CHECK-SD-NEXT:    mov v0.b[9], w18
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w14, w14, w2
+; CHECK-SD-NEXT:    sub w2, w4, w4, lsl #3
+; CHECK-SD-NEXT:    add w3, w5, w3
+; CHECK-SD-NEXT:    add w5, w20, w6, lsr #31
+; CHECK-SD-NEXT:    add w6, w22, w0
+; CHECK-SD-NEXT:    smull x19, w19, w8
+; CHECK-SD-NEXT:    mov v2.b[4], w3
+; CHECK-SD-NEXT:    smov w20, v1.b[7]
+; CHECK-SD-NEXT:    asr w3, w6, #2
+; CHECK-SD-NEXT:    sub w5, w5, w5, lsl #3
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    mov v0.b[10], w14
+; CHECK-SD-NEXT:    smov w14, v1.b[8]
+; CHECK-SD-NEXT:    add w17, w17, w2
+; CHECK-SD-NEXT:    lsr x4, x19, #32
+; CHECK-SD-NEXT:    add w18, w3, w6, lsr #31
+; CHECK-SD-NEXT:    add w3, w7, w5
+; CHECK-SD-NEXT:    mov v2.b[5], w3
+; CHECK-SD-NEXT:    smov x5, v1.b[9]
+; CHECK-SD-NEXT:    add w1, w1, w13
+; CHECK-SD-NEXT:    add w3, w4, w20
+; CHECK-SD-NEXT:    smull x4, w21, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    asr w6, w3, #2
+; CHECK-SD-NEXT:    mov v0.b[11], w17
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    add w18, w0, w18
+; CHECK-SD-NEXT:    asr w0, w1, #2
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    add w2, w6, w3, lsr #31
+; CHECK-SD-NEXT:    lsr x3, x4, #32
+; CHECK-SD-NEXT:    mov v2.b[6], w18
+; CHECK-SD-NEXT:    smull x18, w5, w8
+; CHECK-SD-NEXT:    smov x4, v1.b[10]
+; CHECK-SD-NEXT:    smov w5, v1.b[9]
+; CHECK-SD-NEXT:    add w3, w3, w14
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w17, w0, w1, lsr #31
+; CHECK-SD-NEXT:    asr w0, w3, #2
+; CHECK-SD-NEXT:    smov x6, v1.b[12]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w1, w20, w2
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w3, lsr #31
+; CHECK-SD-NEXT:    smull x2, w4, w8
+; CHECK-SD-NEXT:    smov x3, v1.b[11]
+; CHECK-SD-NEXT:    mov v2.b[7], w1
+; CHECK-SD-NEXT:    add w18, w18, w5
+; CHECK-SD-NEXT:    smov w1, v1.b[10]
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w4, w18, #2
+; CHECK-SD-NEXT:    smull x6, w6, w8
+; CHECK-SD-NEXT:    lsr x2, x2, #32
+; CHECK-SD-NEXT:    add w16, w16, w10
+; CHECK-SD-NEXT:    add w13, w13, w17
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    add w18, w4, w18, lsr #31
+; CHECK-SD-NEXT:    smull x0, w3, w8
+; CHECK-SD-NEXT:    mov v2.b[8], w14
+; CHECK-SD-NEXT:    add w14, w2, w1
+; CHECK-SD-NEXT:    smov w2, v1.b[11]
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    asr w3, w14, #2
+; CHECK-SD-NEXT:    asr w4, w16, #2
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    lsr x6, x6, #32
+; CHECK-SD-NEXT:    mov v0.b[12], w13
+; CHECK-SD-NEXT:    add w18, w5, w18
+; CHECK-SD-NEXT:    add w14, w3, w14, lsr #31
+; CHECK-SD-NEXT:    smov w3, v1.b[12]
+; CHECK-SD-NEXT:    mov v2.b[9], w18
+; CHECK-SD-NEXT:    add w18, w0, w2
+; CHECK-SD-NEXT:    smov x0, v1.b[13]
+; CHECK-SD-NEXT:    asr w5, w18, #2
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    add w16, w4, w16, lsr #31
+; CHECK-SD-NEXT:    smov x4, v1.b[14]
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    add w17, w5, w18, lsr #31
+; CHECK-SD-NEXT:    add w14, w1, w14
+; CHECK-SD-NEXT:    add w18, w6, w3
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    asr w1, w18, #2
+; CHECK-SD-NEXT:    mov v2.b[10], w14
+; CHECK-SD-NEXT:    sub w14, w17, w17, lsl #3
+; CHECK-SD-NEXT:    smov w17, v1.b[13]
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    add w13, w1, w18, lsr #31
+; CHECK-SD-NEXT:    smov x1, v1.b[15]
+; CHECK-SD-NEXT:    add w15, w15, w9
+; CHECK-SD-NEXT:    lsr x18, x0, #32
+; CHECK-SD-NEXT:    add w14, w2, w14
+; CHECK-SD-NEXT:    smull x0, w4, w8
+; CHECK-SD-NEXT:    mov v2.b[11], w14
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    add w14, w18, w17
+; CHECK-SD-NEXT:    smov w18, v1.b[14]
+; CHECK-SD-NEXT:    mov v0.b[13], w10
+; CHECK-SD-NEXT:    asr w16, w14, #2
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    add w13, w3, w13
+; CHECK-SD-NEXT:    smull x8, w1, w8
+; CHECK-SD-NEXT:    lsr x10, x12, #32
+; CHECK-SD-NEXT:    add w14, w16, w14, lsr #31
+; CHECK-SD-NEXT:    asr w16, w15, #2
+; CHECK-SD-NEXT:    mov v2.b[12], w13
+; CHECK-SD-NEXT:    add w13, w0, w18
+; CHECK-SD-NEXT:    smov w0, v1.b[15]
+; CHECK-SD-NEXT:    add w10, w10, w11
+; CHECK-SD-NEXT:    add w15, w16, w15, lsr #31
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    asr w16, w13, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add w12, w17, w14
+; CHECK-SD-NEXT:    add w13, w16, w13, lsr #31
+; CHECK-SD-NEXT:    sub w14, w15, w15, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[13], w12
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w12, w10, #2
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    asr w15, w8, #2
+; CHECK-SD-NEXT:    add w9, w9, w14
+; CHECK-SD-NEXT:    add w10, w12, w10, lsr #31
+; CHECK-SD-NEXT:    mov v0.b[14], w9
+; CHECK-SD-NEXT:    add w12, w18, w13
+; CHECK-SD-NEXT:    add w8, w15, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[14], w12
+; CHECK-SD-NEXT:    sub w9, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add w9, w11, w9
+; CHECK-SD-NEXT:    add w8, w0, w8
+; CHECK-SD-NEXT:    mov v0.b[15], w9
+; CHECK-SD-NEXT:    mov v2.b[15], w8
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv32i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sshll v4.8h, v1.8b, #0
+; CHECK-GI-NEXT:    movi v20.8b, #7
+; CHECK-GI-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v20.8h, v20.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w4, s5
+; CHECK-GI-NEXT:    fmov w19, s4
+; CHECK-GI-NEXT:    sshll v23.4s, v20.4h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v20.8h, #0
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[1]
+; CHECK-GI-NEXT:    sdiv w17, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[1]
+; CHECK-GI-NEXT:    fmov s6, w12
+; CHECK-GI-NEXT:    sdiv w13, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[2]
+; CHECK-GI-NEXT:    fmov s7, w17
+; CHECK-GI-NEXT:    sdiv w18, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[2]
+; CHECK-GI-NEXT:    mov v6.s[1], w13
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov v7.s[1], w18
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v24.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sdiv w16, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v6.s[2], w11
+; CHECK-GI-NEXT:    fmov w14, s2
+; CHECK-GI-NEXT:    sdiv w23, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[1]
+; CHECK-GI-NEXT:    mov v7.s[2], w16
+; CHECK-GI-NEXT:    sdiv w20, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[1]
+; CHECK-GI-NEXT:    fmov s19, w23
+; CHECK-GI-NEXT:    sdiv w24, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[2]
+; CHECK-GI-NEXT:    fmov s18, w20
+; CHECK-GI-NEXT:    sdiv w2, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov v19.s[1], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w21, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[2]
+; CHECK-GI-NEXT:    fmov s16, w2
+; CHECK-GI-NEXT:    sdiv w22, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[3]
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    mov v18.s[1], w21
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v25.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sdiv w1, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[2]
+; CHECK-GI-NEXT:    mov v19.s[2], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w7, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov v16.s[1], w1
+; CHECK-GI-NEXT:    fmov w25, s5
+; CHECK-GI-NEXT:    mov w26, v5.s[1]
+; CHECK-GI-NEXT:    mov w27, v5.s[2]
+; CHECK-GI-NEXT:    mov w28, v5.s[3]
+; CHECK-GI-NEXT:    sshll2 v5.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    fmov w29, s5
+; CHECK-GI-NEXT:    mov w30, v5.s[1]
+; CHECK-GI-NEXT:    mov w11, v5.s[3]
+; CHECK-GI-NEXT:    sdiv w15, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v3.8h, #0
+; CHECK-GI-NEXT:    mov v18.s[2], w7
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    fmov w0, s2
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    sdiv w6, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[1]
+; CHECK-GI-NEXT:    str w9, [sp, #12] // 4-byte Folded Spill
+; CHECK-GI-NEXT:    mov w9, v5.s[2]
+; CHECK-GI-NEXT:    sdiv w25, w25, w8
+; CHECK-GI-NEXT:    fmov s17, w6
+; CHECK-GI-NEXT:    sdiv w29, w29, w8
+; CHECK-GI-NEXT:    fmov s21, w25
+; CHECK-GI-NEXT:    sdiv w5, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[2]
+; CHECK-GI-NEXT:    fmov s22, w29
+; CHECK-GI-NEXT:    sdiv w26, w26, w8
+; CHECK-GI-NEXT:    mov v17.s[1], w5
+; CHECK-GI-NEXT:    sdiv w30, w30, w8
+; CHECK-GI-NEXT:    mov v21.s[1], w26
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w3, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[3]
+; CHECK-GI-NEXT:    mov v22.s[1], w30
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w27, w27, w8
+; CHECK-GI-NEXT:    mov v17.s[2], w3
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v21.s[2], w27
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    mov v22.s[2], w9
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v7.s[3], w10
+; CHECK-GI-NEXT:    mls v0.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v16.s[3], w14
+; CHECK-GI-NEXT:    mls v3.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v17.s[3], w0
+; CHECK-GI-NEXT:    mls v2.4s, v17.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w19, w19, w8
+; CHECK-GI-NEXT:    mov v18.s[3], w4
+; CHECK-GI-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v25.4s, v18.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w28, w28, w8
+; CHECK-GI-NEXT:    mov v19.s[3], w19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mls v1.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w8, w11, w8
+; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; CHECK-GI-NEXT:    mov v21.s[3], w28
+; CHECK-GI-NEXT:    uzp1 v1.8h, v25.8h, v1.8h
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v6.s[3], w11
+; CHECK-GI-NEXT:    mls v4.4s, v21.4s, v23.4s
+; CHECK-GI-NEXT:    mls v24.4s, v6.4s, v23.4s
+; CHECK-GI-NEXT:    mov v22.s[3], w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v24.8h, v0.8h
+; CHECK-GI-NEXT:    mls v5.4s, v22.4s, v20.4s
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <32 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <32 x i8> %s
+}
+
+; Signed remainder of <32 x i8> %d by a splat of 100; %e is unused.
+; The CHECK-SD lines (-global-isel=0) handle every lane in scalar registers:
+; smov extract, smull by the constant built in w8, asr #37, adding the sign
+; bit (lsr #31) and msub by 100. No sdiv appears in that sequence.
+; The CHECK-GI lines (-global-isel=1) still contain scalar sdiv by 100 on the
+; sign-extended lanes, followed by vector mls and uzp1 narrowing, and they
+; spill/reload x19-x30 around that sequence.
+define <32 x i8> @sv32i8_100(<32 x i8> %d, <32 x i8> %e) {
+; CHECK-SD-LABEL: sv32i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.b[1]
+; CHECK-SD-NEXT:    smov x11, v0.b[0]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x12, v0.b[2]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x14, v0.b[3]
+; CHECK-SD-NEXT:    smov x17, v0.b[5]
+; CHECK-SD-NEXT:    smov x15, v0.b[4]
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    smov w2, v0.b[1]
+; CHECK-SD-NEXT:    smov w13, v0.b[2]
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x16, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.b[0]
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    asr x0, x10, #37
+; CHECK-SD-NEXT:    smull x18, w14, w8
+; CHECK-SD-NEXT:    smov w14, v0.b[3]
+; CHECK-SD-NEXT:    asr x1, x16, #37
+; CHECK-SD-NEXT:    smull x16, w17, w8
+; CHECK-SD-NEXT:    smov w10, v0.b[4]
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    add w3, w0, w0, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    add w17, w1, w1, lsr #31
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    smov x0, v0.b[6]
+; CHECK-SD-NEXT:    add w1, w12, w12, lsr #31
+; CHECK-SD-NEXT:    msub w12, w3, w9, w2
+; CHECK-SD-NEXT:    smov x2, v1.b[0]
+; CHECK-SD-NEXT:    msub w11, w17, w9, w11
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    smov x3, v1.b[1]
+; CHECK-SD-NEXT:    asr x15, x15, #37
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    msub w1, w1, w9, w13
+; CHECK-SD-NEXT:    msub w13, w18, w9, w14
+; CHECK-SD-NEXT:    smov w17, v0.b[5]
+; CHECK-SD-NEXT:    smov x14, v0.b[7]
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    add w15, w15, w15, lsr #31
+; CHECK-SD-NEXT:    smull x0, w2, w8
+; CHECK-SD-NEXT:    add w16, w16, w16, lsr #31
+; CHECK-SD-NEXT:    smov x11, v1.b[2]
+; CHECK-SD-NEXT:    smull x2, w3, w8
+; CHECK-SD-NEXT:    smov x3, v0.b[8]
+; CHECK-SD-NEXT:    mov v2.b[1], w12
+; CHECK-SD-NEXT:    msub w15, w15, w9, w10
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    msub w10, w16, w9, w17
+; CHECK-SD-NEXT:    asr x17, x0, #37
+; CHECK-SD-NEXT:    smov w16, v1.b[0]
+; CHECK-SD-NEXT:    asr x0, x2, #37
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w12, v0.b[6]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[2], w1
+; CHECK-SD-NEXT:    smov x1, v1.b[3]
+; CHECK-SD-NEXT:    add w0, w0, w0, lsr #31
+; CHECK-SD-NEXT:    msub w16, w17, w9, w16
+; CHECK-SD-NEXT:    smov w17, v1.b[1]
+; CHECK-SD-NEXT:    asr x2, x11, #37
+; CHECK-SD-NEXT:    msub w11, w18, w9, w12
+; CHECK-SD-NEXT:    asr x12, x14, #37
+; CHECK-SD-NEXT:    smov x18, v1.b[4]
+; CHECK-SD-NEXT:    msub w14, w0, w9, w17
+; CHECK-SD-NEXT:    add w17, w2, w2, lsr #31
+; CHECK-SD-NEXT:    fmov s3, w16
+; CHECK-SD-NEXT:    smull x0, w1, w8
+; CHECK-SD-NEXT:    smov w16, v1.b[2]
+; CHECK-SD-NEXT:    mov v2.b[3], w13
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smull x13, w3, w8
+; CHECK-SD-NEXT:    msub w16, w17, w9, w16
+; CHECK-SD-NEXT:    mov v3.b[1], w14
+; CHECK-SD-NEXT:    smov w14, v0.b[7]
+; CHECK-SD-NEXT:    asr x17, x0, #37
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov x0, v1.b[5]
+; CHECK-SD-NEXT:    asr x13, x13, #37
+; CHECK-SD-NEXT:    msub w12, w12, w9, w14
+; CHECK-SD-NEXT:    smov w14, v1.b[3]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[4], w15
+; CHECK-SD-NEXT:    smov w15, v0.b[8]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    mov v3.b[2], w16
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    smov x17, v1.b[6]
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    smov x16, v0.b[9]
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    msub w13, w13, w9, w15
+; CHECK-SD-NEXT:    smov x15, v0.b[10]
+; CHECK-SD-NEXT:    mov v2.b[5], w10
+; CHECK-SD-NEXT:    smov w10, v1.b[4]
+; CHECK-SD-NEXT:    mov v3.b[3], w14
+; CHECK-SD-NEXT:    asr x14, x0, #37
+; CHECK-SD-NEXT:    smov x0, v1.b[7]
+; CHECK-SD-NEXT:    msub w10, w18, w9, w10
+; CHECK-SD-NEXT:    smov w18, v1.b[5]
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[6], w11
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    smov x11, v0.b[11]
+; CHECK-SD-NEXT:    mov v3.b[4], w10
+; CHECK-SD-NEXT:    msub w14, w14, w9, w18
+; CHECK-SD-NEXT:    smov x18, v1.b[8]
+; CHECK-SD-NEXT:    asr x10, x17, #37
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    smov w17, v0.b[9]
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    mov v2.b[7], w12
+; CHECK-SD-NEXT:    smov w12, v1.b[6]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    mov v3.b[5], w14
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    add w16, w16, w16, lsr #31
+; CHECK-SD-NEXT:    msub w10, w10, w9, w12
+; CHECK-SD-NEXT:    asr x12, x0, #37
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smull x15, w18, w8
+; CHECK-SD-NEXT:    smov x18, v1.b[9]
+; CHECK-SD-NEXT:    smov w0, v1.b[7]
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[8], w13
+; CHECK-SD-NEXT:    mov v3.b[6], w10
+; CHECK-SD-NEXT:    smov w10, v0.b[10]
+; CHECK-SD-NEXT:    msub w16, w16, w9, w17
+; CHECK-SD-NEXT:    msub w12, w12, w9, w0
+; CHECK-SD-NEXT:    asr x13, x15, #37
+; CHECK-SD-NEXT:    smov x17, v1.b[10]
+; CHECK-SD-NEXT:    smull x15, w18, w8
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    smov x18, v0.b[12]
+; CHECK-SD-NEXT:    msub w10, w14, w9, w10
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[8]
+; CHECK-SD-NEXT:    mov v3.b[7], w12
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w12, v0.b[11]
+; CHECK-SD-NEXT:    msub w13, w13, w9, w14
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smov x0, v0.b[13]
+; CHECK-SD-NEXT:    smull x15, w17, w8
+; CHECK-SD-NEXT:    smov x17, v1.b[11]
+; CHECK-SD-NEXT:    mov v2.b[9], w16
+; CHECK-SD-NEXT:    msub w11, w11, w9, w12
+; CHECK-SD-NEXT:    add w12, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[9]
+; CHECK-SD-NEXT:    mov v3.b[8], w13
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    msub w12, w12, w9, w14
+; CHECK-SD-NEXT:    asr x13, x15, #37
+; CHECK-SD-NEXT:    smov x15, v1.b[12]
+; CHECK-SD-NEXT:    smull x14, w17, w8
+; CHECK-SD-NEXT:    smov w17, v1.b[10]
+; CHECK-SD-NEXT:    mov v2.b[10], w10
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    asr x16, x18, #37
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    mov v3.b[9], w12
+; CHECK-SD-NEXT:    smov x12, v1.b[13]
+; CHECK-SD-NEXT:    smov x0, v0.b[14]
+; CHECK-SD-NEXT:    msub w10, w13, w9, w17
+; CHECK-SD-NEXT:    asr x13, x14, #37
+; CHECK-SD-NEXT:    add w14, w16, w16, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    asr x17, x18, #37
+; CHECK-SD-NEXT:    smov w18, v1.b[11]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    smov w16, v0.b[12]
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    mov v3.b[10], w10
+; CHECK-SD-NEXT:    smull x10, w12, w8
+; CHECK-SD-NEXT:    msub w11, w13, w9, w18
+; CHECK-SD-NEXT:    asr x12, x15, #37
+; CHECK-SD-NEXT:    smov x15, v1.b[14]
+; CHECK-SD-NEXT:    msub w13, w14, w9, w16
+; CHECK-SD-NEXT:    smov w16, v1.b[12]
+; CHECK-SD-NEXT:    add w14, w17, w17, lsr #31
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    smov w17, v0.b[13]
+; CHECK-SD-NEXT:    mov v3.b[11], w11
+; CHECK-SD-NEXT:    smull x18, w0, w8
+; CHECK-SD-NEXT:    smov x11, v0.b[15]
+; CHECK-SD-NEXT:    msub w12, w12, w9, w16
+; CHECK-SD-NEXT:    smov x0, v1.b[15]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    smov w16, v1.b[13]
+; CHECK-SD-NEXT:    mov v2.b[12], w13
+; CHECK-SD-NEXT:    msub w13, w14, w9, w17
+; CHECK-SD-NEXT:    mov v3.b[12], w12
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    asr x12, x18, #37
+; CHECK-SD-NEXT:    asr x14, x15, #37
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w15, v0.b[14]
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[13], w13
+; CHECK-SD-NEXT:    add w13, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[14]
+; CHECK-SD-NEXT:    mov v3.b[13], w10
+; CHECK-SD-NEXT:    msub w12, w12, w9, w15
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    msub w10, w13, w9, w14
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    smov w13, v0.b[15]
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.b[15]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.b[14], w12
+; CHECK-SD-NEXT:    mov v3.b[14], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.b[15], w11
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv32i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    .cfi_offset w27, -72
+; CHECK-GI-NEXT:    .cfi_offset w28, -80
+; CHECK-GI-NEXT:    .cfi_offset w30, -88
+; CHECK-GI-NEXT:    .cfi_offset w29, -96
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sshll v4.8h, v1.8b, #0
+; CHECK-GI-NEXT:    movi v20.8b, #100
+; CHECK-GI-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v20.8h, v20.8b, #0
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w4, s5
+; CHECK-GI-NEXT:    fmov w19, s4
+; CHECK-GI-NEXT:    sshll v23.4s, v20.4h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v20.8h, #0
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[1]
+; CHECK-GI-NEXT:    sdiv w17, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[1]
+; CHECK-GI-NEXT:    fmov s6, w12
+; CHECK-GI-NEXT:    sdiv w13, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[2]
+; CHECK-GI-NEXT:    fmov s7, w17
+; CHECK-GI-NEXT:    sdiv w18, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[2]
+; CHECK-GI-NEXT:    mov v6.s[1], w13
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mov v7.s[1], w18
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v24.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sdiv w16, w10, w8
+; CHECK-GI-NEXT:    mov w10, v2.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v6.s[2], w11
+; CHECK-GI-NEXT:    fmov w14, s2
+; CHECK-GI-NEXT:    sdiv w23, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[1]
+; CHECK-GI-NEXT:    mov v7.s[2], w16
+; CHECK-GI-NEXT:    sdiv w20, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[1]
+; CHECK-GI-NEXT:    fmov s19, w23
+; CHECK-GI-NEXT:    sdiv w24, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[2]
+; CHECK-GI-NEXT:    fmov s18, w20
+; CHECK-GI-NEXT:    sdiv w2, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov v19.s[1], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w21, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[2]
+; CHECK-GI-NEXT:    fmov s16, w2
+; CHECK-GI-NEXT:    sdiv w22, w19, w8
+; CHECK-GI-NEXT:    mov w19, v4.s[3]
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    mov v18.s[1], w21
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v25.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sdiv w1, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[2]
+; CHECK-GI-NEXT:    mov v19.s[2], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w7, w4, w8
+; CHECK-GI-NEXT:    mov w4, v5.s[3]
+; CHECK-GI-NEXT:    sshll v5.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov v16.s[1], w1
+; CHECK-GI-NEXT:    fmov w25, s5
+; CHECK-GI-NEXT:    mov w26, v5.s[1]
+; CHECK-GI-NEXT:    mov w27, v5.s[2]
+; CHECK-GI-NEXT:    mov w28, v5.s[3]
+; CHECK-GI-NEXT:    sshll2 v5.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    fmov w29, s5
+; CHECK-GI-NEXT:    mov w30, v5.s[1]
+; CHECK-GI-NEXT:    mov w11, v5.s[3]
+; CHECK-GI-NEXT:    sdiv w15, w14, w8
+; CHECK-GI-NEXT:    mov w14, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v3.8h, #0
+; CHECK-GI-NEXT:    mov v18.s[2], w7
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    fmov w0, s2
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    sdiv w6, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[1]
+; CHECK-GI-NEXT:    str w9, [sp, #12] // 4-byte Folded Spill
+; CHECK-GI-NEXT:    mov w9, v5.s[2]
+; CHECK-GI-NEXT:    sdiv w25, w25, w8
+; CHECK-GI-NEXT:    fmov s17, w6
+; CHECK-GI-NEXT:    sdiv w29, w29, w8
+; CHECK-GI-NEXT:    fmov s21, w25
+; CHECK-GI-NEXT:    sdiv w5, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[2]
+; CHECK-GI-NEXT:    fmov s22, w29
+; CHECK-GI-NEXT:    sdiv w26, w26, w8
+; CHECK-GI-NEXT:    mov v17.s[1], w5
+; CHECK-GI-NEXT:    sdiv w30, w30, w8
+; CHECK-GI-NEXT:    mov v21.s[1], w26
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w3, w0, w8
+; CHECK-GI-NEXT:    mov w0, v2.s[3]
+; CHECK-GI-NEXT:    mov v22.s[1], w30
+; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w27, w27, w8
+; CHECK-GI-NEXT:    mov v17.s[2], w3
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v21.s[2], w27
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    mov v22.s[2], w9
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v7.s[3], w10
+; CHECK-GI-NEXT:    mls v0.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v16.s[3], w14
+; CHECK-GI-NEXT:    mls v3.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v17.s[3], w0
+; CHECK-GI-NEXT:    mls v2.4s, v17.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w19, w19, w8
+; CHECK-GI-NEXT:    mov v18.s[3], w4
+; CHECK-GI-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v25.4s, v18.4s, v23.4s
+; CHECK-GI-NEXT:    sdiv w28, w28, w8
+; CHECK-GI-NEXT:    mov v19.s[3], w19
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mls v1.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT:    sdiv w8, w11, w8
+; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; CHECK-GI-NEXT:    mov v21.s[3], w28
+; CHECK-GI-NEXT:    uzp1 v1.8h, v25.8h, v1.8h
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v6.s[3], w11
+; CHECK-GI-NEXT:    mls v4.4s, v21.4s, v23.4s
+; CHECK-GI-NEXT:    mls v24.4s, v6.4s, v23.4s
+; CHECK-GI-NEXT:    mov v22.s[3], w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v24.8h, v0.8h
+; CHECK-GI-NEXT:    mls v5.4s, v22.4s, v20.4s
+; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <32 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <32 x i8> %s
+}
+
+; Unsigned remainder of <2 x i8> %d by a splat of 7; %e is unused.
+; The CHECK-SD lines (-global-isel=0) mask each 32-bit lane to its low byte,
+; umull by a constant duplicated from w8, narrow with shrn #32 and finish
+; with mls by 7.
+; The CHECK-GI lines (-global-isel=1) multiply by 37 and use vector shifts
+; (ushl by negated amounts), a sub and an add before the final mls by 7.
+; Neither sequence contains a udiv.
+define <2 x i8> @uv2i8_7(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #37
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w9
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    umov w8, v2.b[0]
+; CHECK-GI-NEXT:    umov w9, v2.b[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v2.b[1], w8
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    neg v2.8b, v2.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 7, i8 7>
+  ret <2 x i8> %s
+}
+
+; Unsigned remainder of <2 x i8> %d by a splat of 100; %e is unused.
+; The CHECK-SD lines (-global-isel=0) mask each 32-bit lane to its low byte,
+; umull by a constant duplicated from w8, narrow with shrn #32 and finish
+; with mls by 100.
+; The CHECK-GI lines (-global-isel=1) multiply by 41, apply two vector shifts
+; (ushl by negated amounts built from 8 and 4) and finish with mls by 100.
+; Neither sequence contains a udiv.
+define <2 x i8> @uv2i8_100(<2 x i8> %d, <2 x i8> %e) {
+; CHECK-SD-LABEL: uv2i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi v2.2s, #41
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    and v1.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w9, v1.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i8> %d, <i8 100, i8 100>
+  ret <2 x i8> %s
+}
+
+; Unsigned remainder of <3 x i8> %d by a splat of 7; %e is unused.
+; In both check bodies the three elements are read from w0, w1, w2 and the
+; results are written back to w0, w1, w2.
+; The CHECK-SD lines (-global-isel=0) stay scalar: mask to 8 bits, umull by
+; the constant in w8, lsr #32, then sub (with lsl #3) and add.
+; The CHECK-GI lines (-global-isel=1) insert the elements into vector lanes,
+; multiply by 37, apply vector shifts with a sub and an add, finish with mls
+; by 7 and extract the lanes with umov.
+; Neither sequence contains a udiv.
+define <3 x i8> @uv3i8_7(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w0, w9, w11
+; CHECK-SD-NEXT:    add w1, w10, w13
+; CHECK-SD-NEXT:    add w2, w12, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #37 // =0x25
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w10
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mul v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    mov b3, v2.b[1]
+; CHECK-GI-NEXT:    mov b4, v2.b[2]
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    mov b2, v1.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[2]
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umov w0, v0.h[0]
+; CHECK-GI-NEXT:    umov w1, v0.h[1]
+; CHECK-GI-NEXT:    umov w2, v0.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 7, i8 7, i8 7>
+  ret <3 x i8> %s
+}
+
+; Unsigned remainder of <3 x i8> %d by a splat of 100; %e is unused.
+; In both check bodies the three elements are read from w0, w1, w2 and the
+; results are written back to w0, w1, w2.
+; The CHECK-SD lines (-global-isel=0) stay scalar: mask to 8 bits, umull by
+; the constant in w8, lsr #32, then msub by 100.
+; The CHECK-GI lines (-global-isel=1) insert the elements into vector lanes,
+; multiply by 41, apply two vector shifts (ushl by negated amounts built from
+; 8 and 4), finish with mls by 100 and extract the lanes with umov.
+; Neither sequence contains a udiv.
+define <3 x i8> @uv3i8_100(<3 x i8> %d, <3 x i8> %e) {
+; CHECK-SD-LABEL: uv3i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    and w9, w0, #0xff
+; CHECK-SD-NEXT:    and w10, w1, #0xff
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    and w12, w2, #0xff
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w0, w11, w14, w9
+; CHECK-SD-NEXT:    msub w1, w13, w14, w10
+; CHECK-SD-NEXT:    msub w2, w8, w14, w12
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    mov w10, #41 // =0x29
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    fmov s1, w10
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    and w9, w2, #0xff
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w10
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    neg v1.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT:    neg v1.8b, v3.8b
+; CHECK-GI-NEXT:    fmov s3, w0
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.h[1], w1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.h[2], w2
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.h[2], w9
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mls v3.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w0, v3.h[0]
+; CHECK-GI-NEXT:    umov w1, v3.h[1]
+; CHECK-GI-NEXT:    umov w2, v3.h[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i8> %d, <i8 100, i8 100, i8 100>
+  ret <3 x i8> %s
+}
+
+; Unsigned remainder of <4 x i8> %d by a splat of 7; %e is unused.
+; The CHECK-SD lines (-global-isel=0) clear the high byte of each halfword
+; lane, move every lane to a scalar register with umov, compute umull /
+; lsr #32 / sub (lsl #3) / add per element, and reinsert the results into v0.
+; The CHECK-GI lines (-global-isel=1) keep the work in vector registers:
+; mul by a vector of 37, ushr #8, sub, ushl, usra #8, ushl, then mls by 7.
+; Neither sequence contains a udiv.
+define <4 x i8> @uv4i8_7(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w13, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    umull x14, w13, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    lsr x9, x14, #32
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    add w8, w15, w8
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #37 // =0x25
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    mov v4.b[1], w8
+; CHECK-GI-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    mov v4.b[2], w8
+; CHECK-GI-NEXT:    ushl v2.8b, v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mov v4.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.4h, v1.4h, #8
+; CHECK-GI-NEXT:    uzp1 v1.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v4.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
+  ret <4 x i8> %s
+}
+
+; Unsigned remainder of <4 x i8> %d by a splat of 100; %e is unused.
+; The CHECK-SD lines (-global-isel=0) clear the high byte of each halfword
+; lane, move every lane to a scalar register with umov, compute umull /
+; lsr #32 / msub by 100 per element, and reinsert the results into v0.
+; The CHECK-GI lines (-global-isel=1) keep the work in vector registers:
+; mul by a vector of 41, ushr #8, ushl by a negated vector of 4, then mls
+; by 100.
+; Neither sequence contains a udiv.
+define <4 x i8> @uv4i8_100(<4 x i8> %d, <4 x i8> %e) {
+; CHECK-SD-LABEL: uv4i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    umull x11, w12, w8
+; CHECK-SD-NEXT:    msub w10, w13, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    umull x8, w15, w8
+; CHECK-SD-NEXT:    lsr x9, x11, #32
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    msub w9, w9, w14, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w8, w8, w14, w15
+; CHECK-SD-NEXT:    mov v0.h[2], w9
+; CHECK-SD-NEXT:    mov v0.h[3], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    mov v3.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    ushl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    dup v2.4h, w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
+  ret <4 x i8> %s
+}
+
+define <8 x i8> @uv8i8_7(<8 x i8> %d, <8 x i8> %e) { ; <8 x i8> urem by splat 7, %e unused. Both expectations are divide-free vector code; they differ only in the shift-right-by-1-and-add step (ushll+shrn+add vs. a single usra).
+; CHECK-SD-LABEL: uv8i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.8b, #37
+; CHECK-SD-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-SD-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-SD-NEXT:    shrn v2.8b, v2.8h, #1
+; CHECK-SD-NEXT:    add v1.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT:    movi v2.8b, #7
+; CHECK-SD-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-SD-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.8b, #37
+; CHECK-GI-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-GI-NEXT:    sub v2.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usra v1.8b, v2.8b, #1
+; CHECK-GI-NEXT:    movi v2.8b, #7
+; CHECK-GI-NEXT:    ushr v1.8b, v1.8b, #2
+; CHECK-GI-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <8 x i8> %s
+}
+
+define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; <8 x i8> urem by splat 100, %e unused. A single shared, divide-free expectation (umull/shrn/ushr/mls) covers both RUN lines.
+; CHECK-LABEL: uv8i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.8b, #41
+; CHECK-NEXT:    movi v2.8b, #100
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    ushr v1.8b, v1.8b, #4
+; CHECK-NEXT:    mls v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <8 x i8> %s
+}
+
+define <16 x i8> @uv16i8_7(<16 x i8> %d, <16 x i8> %e) { ; <16 x i8> urem by splat 7, %e unused. A single shared, divide-free expectation (umull/umull2/uzp2/usra/ushr/mls) covers both RUN lines.
+; CHECK-LABEL: uv16i8_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #37
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    sub v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    usra v1.16b, v2.16b, #1
+; CHECK-NEXT:    movi v2.16b, #7
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #2
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %s
+}
+
+define <16 x i8> @uv16i8_100(<16 x i8> %d, <16 x i8> %e) { ; <16 x i8> urem by splat 100, %e unused. A single shared, divide-free expectation (umull/umull2/uzp2/ushr/mls) covers both RUN lines.
+; CHECK-LABEL: uv16i8_100:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.16b, #41
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT:    umull v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    movi v2.16b, #100
+; CHECK-NEXT:    ushr v1.16b, v1.16b, #4
+; CHECK-NEXT:    mls v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %s = urem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <16 x i8> %s
+}
+
+define <32 x i8> @uv32i8_7(<32 x i8> %d, <32 x i8> %e) { ; <32 x i8> urem by splat 7, %e unused. SelectionDAG expectation handles each byte lane in scalar registers (umov/umull/sub/add, reinserted with mov v.b[n]); GlobalISel expectation stays in vector registers. Neither contains a divide.
+; CHECK-SD-LABEL: uv32i8_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w10, v0.b[0]
+; CHECK-SD-NEXT:    umov w13, v1.b[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w9, v0.b[1]
+; CHECK-SD-NEXT:    umov w12, v1.b[1]
+; CHECK-SD-NEXT:    umov w17, v0.b[2]
+; CHECK-SD-NEXT:    umov w0, v1.b[2]
+; CHECK-SD-NEXT:    umov w1, v1.b[3]
+; CHECK-SD-NEXT:    umull x14, w10, w8
+; CHECK-SD-NEXT:    umull x16, w13, w8
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x15, w12, w8
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x2, w17, w8
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    lsr x18, x11, #32
+; CHECK-SD-NEXT:    umov w11, v0.b[3]
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    umull x14, w0, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w13, w13, w16
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    fmov s3, w13
+; CHECK-SD-NEXT:    add w18, w9, w18
+; CHECK-SD-NEXT:    lsr x10, x2, #32
+; CHECK-SD-NEXT:    add w12, w12, w15
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    umov w9, v0.b[4]
+; CHECK-SD-NEXT:    umov w15, v1.b[4]
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    mov v2.b[1], w18
+; CHECK-SD-NEXT:    umull x13, w1, w8
+; CHECK-SD-NEXT:    mov v3.b[1], w12
+; CHECK-SD-NEXT:    sub w2, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w12, w14, w14, lsl #3
+; CHECK-SD-NEXT:    umov w10, v0.b[5]
+; CHECK-SD-NEXT:    umov w18, v1.b[5]
+; CHECK-SD-NEXT:    add w14, w17, w2
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    add w12, w0, w12
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[2], w14
+; CHECK-SD-NEXT:    mov v3.b[2], w12
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w12, v0.b[6]
+; CHECK-SD-NEXT:    add w11, w11, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w14, v1.b[6]
+; CHECK-SD-NEXT:    add w13, w1, w13
+; CHECK-SD-NEXT:    umull x16, w10, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w18, w8
+; CHECK-SD-NEXT:    mov v2.b[3], w11
+; CHECK-SD-NEXT:    mov v3.b[3], w13
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    umov w11, v0.b[7]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w13, v1.b[7]
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[4], w9
+; CHECK-SD-NEXT:    mov v3.b[4], w15
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w9, v0.b[8]
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v1.b[8]
+; CHECK-SD-NEXT:    add w18, w18, w1
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w13, w8
+; CHECK-SD-NEXT:    mov v2.b[5], w10
+; CHECK-SD-NEXT:    mov v3.b[5], w18
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w17
+; CHECK-SD-NEXT:    umov w10, v0.b[9]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w18, v1.b[9]
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[6], w12
+; CHECK-SD-NEXT:    mov v3.b[6], w14
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w12, v0.b[10]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w11, w11, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w14, v1.b[10]
+; CHECK-SD-NEXT:    add w13, w13, w1
+; CHECK-SD-NEXT:    umull x16, w10, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w18, w8
+; CHECK-SD-NEXT:    mov v2.b[7], w11
+; CHECK-SD-NEXT:    mov v3.b[7], w13
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w11, v0.b[11]
+; CHECK-SD-NEXT:    umov w13, v1.b[11]
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[8], w9
+; CHECK-SD-NEXT:    mov v3.b[8], w15
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w9, v0.b[12]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w10, w10, w16
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v1.b[12]
+; CHECK-SD-NEXT:    add w18, w18, w1
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    umull x1, w13, w8
+; CHECK-SD-NEXT:    mov v2.b[9], w10
+; CHECK-SD-NEXT:    mov v3.b[9], w18
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w17
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umov w10, v0.b[13]
+; CHECK-SD-NEXT:    umov w18, v1.b[13]
+; CHECK-SD-NEXT:    add w14, w14, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x0, w15, w8
+; CHECK-SD-NEXT:    mov v2.b[10], w12
+; CHECK-SD-NEXT:    mov v3.b[10], w14
+; CHECK-SD-NEXT:    sub w12, w16, w16, lsl #3
+; CHECK-SD-NEXT:    lsr x16, x17, #32
+; CHECK-SD-NEXT:    sub w17, w1, w1, lsl #3
+; CHECK-SD-NEXT:    umov w14, v0.b[14]
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    add w13, w13, w17
+; CHECK-SD-NEXT:    umull x17, w18, w8
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    mov v3.b[11], w13
+; CHECK-SD-NEXT:    umov w11, v1.b[14]
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w16
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umov w13, v0.b[15]
+; CHECK-SD-NEXT:    umov w16, v1.b[15]
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    mov v2.b[12], w9
+; CHECK-SD-NEXT:    umull x9, w14, w8
+; CHECK-SD-NEXT:    mov v3.b[12], w15
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    add w10, w10, w12
+; CHECK-SD-NEXT:    add w12, w18, w17
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    mov v2.b[13], w10
+; CHECK-SD-NEXT:    umull x10, w13, w8
+; CHECK-SD-NEXT:    mov v3.b[13], w12
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w12, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w9, w14, w9
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.b[14], w9
+; CHECK-SD-NEXT:    mov v3.b[14], w11
+; CHECK-SD-NEXT:    sub w9, w10, w10, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    add w8, w16, w8
+; CHECK-SD-NEXT:    mov v2.b[15], w9
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv32i8_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.8b, #37
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    umull v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    umull v6.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umull v3.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    umull v2.8h, v5.8b, v2.8b
+; CHECK-GI-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-GI-NEXT:    shrn v5.8b, v6.8h, #8
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v5.16b, v2.8h, #8
+; CHECK-GI-NEXT:    shrn2 v6.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v7.16b, v2.8h, #8
+; CHECK-GI-NEXT:    movi v2.16b, #7
+; CHECK-GI-NEXT:    sub v6.16b, v0.16b, v6.16b
+; CHECK-GI-NEXT:    sub v7.16b, v1.16b, v7.16b
+; CHECK-GI-NEXT:    usra v4.16b, v6.16b, #1
+; CHECK-GI-NEXT:    usra v5.16b, v7.16b, #1
+; CHECK-GI-NEXT:    ushr v3.16b, v4.16b, #2
+; CHECK-GI-NEXT:    ushr v4.16b, v5.16b, #2
+; CHECK-GI-NEXT:    mls v0.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    mls v1.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <32 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <32 x i8> %s
+}
+
+define <32 x i8> @uv32i8_100(<32 x i8> %d, <32 x i8> %e) { ; <32 x i8> urem by splat 100, %e unused. SelectionDAG expectation handles each byte lane in scalar registers (umov/umull/msub, reinserted with mov v.b[n]); GlobalISel expectation stays in vector registers. Neither contains a divide.
+; CHECK-SD-LABEL: uv32i8_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w11, v0.b[0]
+; CHECK-SD-NEXT:    umov w14, v1.b[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w10, v0.b[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w13, v1.b[1]
+; CHECK-SD-NEXT:    umov w12, v0.b[2]
+; CHECK-SD-NEXT:    umov w17, v1.b[2]
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    umull x1, w14, w8
+; CHECK-SD-NEXT:    umull x9, w10, w8
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    lsr x0, x9, #32
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    msub w11, w15, w9, w11
+; CHECK-SD-NEXT:    umov w15, v0.b[3]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    msub w14, w1, w9, w14
+; CHECK-SD-NEXT:    umov w1, v1.b[3]
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    msub w10, w0, w9, w10
+; CHECK-SD-NEXT:    umull x0, w17, w8
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    umov w11, v0.b[5]
+; CHECK-SD-NEXT:    msub w13, w18, w9, w13
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    umov w14, v1.b[4]
+; CHECK-SD-NEXT:    msub w12, w16, w9, w12
+; CHECK-SD-NEXT:    umov w16, v0.b[4]
+; CHECK-SD-NEXT:    umull x18, w15, w8
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    mov v2.b[1], w10
+; CHECK-SD-NEXT:    mov v3.b[1], w13
+; CHECK-SD-NEXT:    umull x13, w1, w8
+; CHECK-SD-NEXT:    msub w17, w0, w9, w17
+; CHECK-SD-NEXT:    umov w0, v0.b[6]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umull x10, w16, w8
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    mov v2.b[2], w12
+; CHECK-SD-NEXT:    umull x12, w11, w8
+; CHECK-SD-NEXT:    msub w15, w18, w9, w15
+; CHECK-SD-NEXT:    umov w18, v1.b[5]
+; CHECK-SD-NEXT:    mov v3.b[2], w17
+; CHECK-SD-NEXT:    umull x17, w14, w8
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    msub w13, w13, w9, w1
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umov w1, v0.b[7]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    umov w16, v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[3], w15
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umull x15, w0, w8
+; CHECK-SD-NEXT:    mov v3.b[3], w13
+; CHECK-SD-NEXT:    umull x13, w18, w8
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    umov w17, v0.b[8]
+; CHECK-SD-NEXT:    msub w11, w12, w9, w11
+; CHECK-SD-NEXT:    umov w12, v1.b[7]
+; CHECK-SD-NEXT:    mov v2.b[4], w10
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    umull x10, w1, w8
+; CHECK-SD-NEXT:    mov v3.b[4], w14
+; CHECK-SD-NEXT:    umull x14, w16, w8
+; CHECK-SD-NEXT:    msub w13, w13, w9, w18
+; CHECK-SD-NEXT:    umov w18, v0.b[9]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[8]
+; CHECK-SD-NEXT:    mov v2.b[5], w11
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    mov v3.b[5], w13
+; CHECK-SD-NEXT:    umull x13, w12, w8
+; CHECK-SD-NEXT:    msub w14, w14, w9, w16
+; CHECK-SD-NEXT:    umov w16, v0.b[10]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w1
+; CHECK-SD-NEXT:    umov w1, v1.b[9]
+; CHECK-SD-NEXT:    mov v2.b[6], w15
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    umull x15, w18, w8
+; CHECK-SD-NEXT:    mov v3.b[6], w14
+; CHECK-SD-NEXT:    umull x14, w0, w8
+; CHECK-SD-NEXT:    msub w12, w13, w9, w12
+; CHECK-SD-NEXT:    umov w13, v0.b[11]
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    umov w17, v1.b[10]
+; CHECK-SD-NEXT:    mov v2.b[7], w10
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    umull x10, w16, w8
+; CHECK-SD-NEXT:    mov v3.b[7], w12
+; CHECK-SD-NEXT:    umull x12, w1, w8
+; CHECK-SD-NEXT:    msub w14, w14, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[11]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w18
+; CHECK-SD-NEXT:    mov v2.b[8], w11
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umull x18, w17, w8
+; CHECK-SD-NEXT:    mov v3.b[8], w14
+; CHECK-SD-NEXT:    umov w14, v0.b[12]
+; CHECK-SD-NEXT:    umull x11, w13, w8
+; CHECK-SD-NEXT:    msub w12, w12, w9, w1
+; CHECK-SD-NEXT:    umull x1, w0, w8
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    mov v2.b[9], w15
+; CHECK-SD-NEXT:    umov w15, v1.b[12]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    mov v3.b[9], w12
+; CHECK-SD-NEXT:    msub w16, w18, w9, w17
+; CHECK-SD-NEXT:    umov w12, v0.b[13]
+; CHECK-SD-NEXT:    lsr x18, x1, #32
+; CHECK-SD-NEXT:    umull x17, w14, w8
+; CHECK-SD-NEXT:    mov v2.b[10], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    umov w13, v0.b[14]
+; CHECK-SD-NEXT:    msub w18, w18, w9, w0
+; CHECK-SD-NEXT:    umov w0, v1.b[13]
+; CHECK-SD-NEXT:    umull x10, w15, w8
+; CHECK-SD-NEXT:    mov v3.b[10], w16
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    msub w14, w17, w9, w14
+; CHECK-SD-NEXT:    umov w17, v1.b[14]
+; CHECK-SD-NEXT:    mov v2.b[11], w11
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x11, w13, w8
+; CHECK-SD-NEXT:    mov v3.b[11], w18
+; CHECK-SD-NEXT:    umull x18, w0, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    msub w10, w10, w9, w15
+; CHECK-SD-NEXT:    umov w15, v0.b[15]
+; CHECK-SD-NEXT:    msub w12, w16, w9, w12
+; CHECK-SD-NEXT:    mov v2.b[12], w14
+; CHECK-SD-NEXT:    umov w14, v1.b[15]
+; CHECK-SD-NEXT:    lsr x16, x18, #32
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    mov v3.b[12], w10
+; CHECK-SD-NEXT:    umull x10, w17, w8
+; CHECK-SD-NEXT:    msub w16, w16, w9, w0
+; CHECK-SD-NEXT:    msub w11, w11, w9, w13
+; CHECK-SD-NEXT:    mov v2.b[13], w12
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umull x13, w15, w8
+; CHECK-SD-NEXT:    mov v3.b[13], w16
+; CHECK-SD-NEXT:    umull x8, w14, w8
+; CHECK-SD-NEXT:    msub w10, w10, w9, w17
+; CHECK-SD-NEXT:    lsr x12, x13, #32
+; CHECK-SD-NEXT:    mov v2.b[14], w11
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v3.b[14], w10
+; CHECK-SD-NEXT:    msub w11, w12, w9, w15
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.b[15], w11
+; CHECK-SD-NEXT:    mov v3.b[15], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv32i8_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.8b, #41
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    umull v5.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    umull v6.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umull v3.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    umull v2.8h, v4.8b, v2.8b
+; CHECK-GI-NEXT:    shrn v4.8b, v5.8h, #8
+; CHECK-GI-NEXT:    shrn v5.8b, v6.8h, #8
+; CHECK-GI-NEXT:    shrn2 v4.16b, v3.8h, #8
+; CHECK-GI-NEXT:    shrn2 v5.16b, v2.8h, #8
+; CHECK-GI-NEXT:    movi v2.16b, #100
+; CHECK-GI-NEXT:    ushr v3.16b, v4.16b, #4
+; CHECK-GI-NEXT:    ushr v4.16b, v5.16b, #4
+; CHECK-GI-NEXT:    mls v0.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    mls v1.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <32 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
+  ret <32 x i8> %s
+}
+
+define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) { ; <2 x i16> srem by splat 7, %e unused. SelectionDAG expectation is a multiply-based vector sequence (smull) with no divide; GlobalISel expectation issues one scalar sdiv per lane.
+; CHECK-SD-LABEL: sv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v1.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v1.2s, #16
+; CHECK-SD-NEXT:    smull v2.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #32
+; CHECK-SD-NEXT:    ssra v2.2s, v1.2s, #16
+; CHECK-SD-NEXT:    sshr v1.2s, v2.2s, #2
+; CHECK-SD-NEXT:    usra v1.2s, v2.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) { ; <2 x i16> srem by splat 100, %e unused. SelectionDAG expectation is a multiply-based vector sequence (smull) with no divide; GlobalISel expectation issues one scalar sdiv per lane.
+; CHECK-SD-LABEL: sv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+define <3 x i16> @sv3i16_7(<3 x i16> %d, <3 x i16> %e) { ; <3 x i16> srem by splat 7, %e unused. Both expectations work lane by lane in scalar registers: SelectionDAG uses smull-based sequences with no divide, GlobalISel issues one sdiv per lane.
+; CHECK-SD-LABEL: sv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov w12, v0.h[0]
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    add w9, w9, w12
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    asr w14, w9, #2
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    asr w15, w10, #2
+; CHECK-SD-NEXT:    add w8, w8, w11
+; CHECK-SD-NEXT:    add w9, w14, w9, lsr #31
+; CHECK-SD-NEXT:    asr w14, w8, #2
+; CHECK-SD-NEXT:    add w10, w15, w10, lsr #31
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    add w8, w14, w8, lsr #31
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    add w9, w12, w9
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w13, w10
+; CHECK-SD-NEXT:    add w8, w11, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+define <3 x i16> @sv3i16_100(<3 x i16> %d, <3 x i16> %e) { ; <3 x i16> srem by splat 100, %e unused. Both expectations work lane by lane in scalar registers: SelectionDAG uses smull/msub with no divide, GlobalISel issues one sdiv per lane followed by msub.
+; CHECK-SD-LABEL: sv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    smov x9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    smov w13, v0.h[1]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x8, w11, w8
+; CHECK-SD-NEXT:    smov w11, v0.h[0]
+; CHECK-SD-NEXT:    asr x9, x9, #37
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    add w9, w9, w9, lsr #31
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    msub w9, w9, w12, w11
+; CHECK-SD-NEXT:    smov w11, v0.h[2]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    msub w10, w10, w12, w13
+; CHECK-SD-NEXT:    msub w8, w8, w12, w11
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    smov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    smov w11, v0.h[1]
+; CHECK-GI-NEXT:    smov w13, v0.h[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; <4 x i16> srem by splat 7, %e unused. SelectionDAG expectation is a multiply-based vector sequence (smull) with no divide; GlobalISel expectation widens to 32-bit lanes and issues one scalar sdiv per lane.
+; CHECK-SD-LABEL: sv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; <4 x i16> srem by splat 100, %e unused. SelectionDAG expectation is a multiply-based vector sequence (smull) with no divide; GlobalISel expectation widens to 32-bit lanes and issues one scalar sdiv per lane.
+; CHECK-SD-LABEL: sv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #19
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    usra v1.4h, v1.4h, #15
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) { ; <8 x i16> srem by splat 7, %e unused. SelectionDAG expectation is a multiply-based vector sequence (smull/smull2) with no divide; GlobalISel expectation widens to two 4 x 32-bit halves and issues one scalar sdiv per lane (eight in total).
+; CHECK-SD-LABEL: sv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    movi v4.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+; Signed remainder of <8 x i16> by a splat of 100; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): no division instruction is expected. The
+; input is multiplied by 5243 (smull/smull2), the high halves are gathered with
+; uzp2, adjusted by sshr #3 and usra #15, and folded back with mls against 100.
+; GlobalISel run (-global-isel=1): the lanes are sign-extended to 32 bits and
+; each one goes through a scalar sdiv by 100, followed by vector mls and uzp1.
+define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: sv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    sshr v1.8h, v1.8h, #3
+; CHECK-SD-NEXT:    usra v1.8h, v1.8h, #15
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    movi v4.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    mov w10, v1.s[1]
+; CHECK-GI-NEXT:    mov w14, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v1.s[2]
+; CHECK-GI-NEXT:    mov w15, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
+; CHECK-GI-NEXT:    mov w16, v0.s[3]
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+; Signed remainder of <16 x i16> by a splat of 7; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the expected code is fully scalarised and
+; contains no division instruction. Each lane is moved to a general-purpose
+; register with smov, multiplied with smull by the constant built in x8 by
+; mov/movk, reduced with lsr #32, add, asr #2 and "add ..., lsr #31", turned
+; into a remainder with "sub wN, wN, wN, lsl #3" plus an add of the original
+; lane, and finally inserted into v2/v3, which are copied to v0/v1 at the end.
+; GlobalISel run (-global-isel=1): the lanes are sign-extended to 32 bits and
+; each one goes through a scalar sdiv by 7, followed by vector mls and uzp1.
+define <16 x i16> @sv16i16_7(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: sv16i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.h[0]
+; CHECK-SD-NEXT:    smov x9, v0.h[1]
+; CHECK-SD-NEXT:    mov x8, #-56173 // =0xffffffffffff2493
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    smov w15, v0.h[0]
+; CHECK-SD-NEXT:    smov w12, v0.h[1]
+; CHECK-SD-NEXT:    smov x18, v0.h[3]
+; CHECK-SD-NEXT:    smov w14, v0.h[2]
+; CHECK-SD-NEXT:    smov x3, v1.h[1]
+; CHECK-SD-NEXT:    smov x16, v0.h[4]
+; CHECK-SD-NEXT:    smov x5, v1.h[0]
+; CHECK-SD-NEXT:    smull x0, w10, w8
+; CHECK-SD-NEXT:    smov w13, v0.h[3]
+; CHECK-SD-NEXT:    smov w10, v0.h[4]
+; CHECK-SD-NEXT:    smull x9, w9, w8
+; CHECK-SD-NEXT:    smov x17, v0.h[5]
+; CHECK-SD-NEXT:    smull x1, w11, w8
+; CHECK-SD-NEXT:    smov x11, v0.h[6]
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    lsr x2, x9, #32
+; CHECK-SD-NEXT:    smull x3, w3, w8
+; CHECK-SD-NEXT:    smov w9, v0.h[5]
+; CHECK-SD-NEXT:    add w0, w0, w15
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    smull x7, w16, w8
+; CHECK-SD-NEXT:    add w2, w2, w12
+; CHECK-SD-NEXT:    asr w6, w0, #2
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    asr w4, w2, #2
+; CHECK-SD-NEXT:    add w1, w1, w14
+; CHECK-SD-NEXT:    smov w16, v1.h[1]
+; CHECK-SD-NEXT:    add w0, w6, w0, lsr #31
+; CHECK-SD-NEXT:    smull x5, w5, w8
+; CHECK-SD-NEXT:    add w18, w18, w13
+; CHECK-SD-NEXT:    add w2, w4, w2, lsr #31
+; CHECK-SD-NEXT:    asr w4, w1, #2
+; CHECK-SD-NEXT:    lsr x3, x3, #32
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    asr w6, w18, #2
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w1, w4, w1, lsr #31
+; CHECK-SD-NEXT:    smov x4, v1.h[2]
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    smov w0, v1.h[0]
+; CHECK-SD-NEXT:    add w3, w3, w16
+; CHECK-SD-NEXT:    fmov s2, w15
+; CHECK-SD-NEXT:    add w2, w12, w2
+; CHECK-SD-NEXT:    lsr x5, x5, #32
+; CHECK-SD-NEXT:    add w18, w6, w18, lsr #31
+; CHECK-SD-NEXT:    lsr x6, x7, #32
+; CHECK-SD-NEXT:    sub w1, w1, w1, lsl #3
+; CHECK-SD-NEXT:    smull x4, w4, w8
+; CHECK-SD-NEXT:    smov w7, v1.h[3]
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    mov v2.h[1], w2
+; CHECK-SD-NEXT:    asr w2, w3, #2
+; CHECK-SD-NEXT:    add w5, w5, w0
+; CHECK-SD-NEXT:    add w6, w6, w10
+; CHECK-SD-NEXT:    add w14, w14, w1
+; CHECK-SD-NEXT:    smov x1, v1.h[3]
+; CHECK-SD-NEXT:    add w2, w2, w3, lsr #31
+; CHECK-SD-NEXT:    lsr x3, x4, #32
+; CHECK-SD-NEXT:    asr w4, w5, #2
+; CHECK-SD-NEXT:    asr w15, w6, #2
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w17, w17, w9
+; CHECK-SD-NEXT:    add w4, w4, w5, lsr #31
+; CHECK-SD-NEXT:    sub w2, w2, w2, lsl #3
+; CHECK-SD-NEXT:    mov v2.h[2], w14
+; CHECK-SD-NEXT:    add w6, w15, w6, lsr #31
+; CHECK-SD-NEXT:    smov w15, v1.h[2]
+; CHECK-SD-NEXT:    add w13, w13, w18
+; CHECK-SD-NEXT:    sub w4, w4, w4, lsl #3
+; CHECK-SD-NEXT:    smov x18, v1.h[4]
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    add w16, w16, w2
+; CHECK-SD-NEXT:    asr w5, w17, #2
+; CHECK-SD-NEXT:    sub w6, w6, w6, lsl #3
+; CHECK-SD-NEXT:    add w0, w0, w4
+; CHECK-SD-NEXT:    mov v2.h[3], w13
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    add w3, w3, w15
+; CHECK-SD-NEXT:    fmov s3, w0
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    asr w2, w3, #2
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    add w10, w10, w6
+; CHECK-SD-NEXT:    add w17, w5, w17, lsr #31
+; CHECK-SD-NEXT:    smov w12, v0.h[6]
+; CHECK-SD-NEXT:    smov x14, v0.h[7]
+; CHECK-SD-NEXT:    add w0, w2, w3, lsr #31
+; CHECK-SD-NEXT:    smov w2, v1.h[4]
+; CHECK-SD-NEXT:    mov v3.h[1], w16
+; CHECK-SD-NEXT:    add w16, w1, w7
+; CHECK-SD-NEXT:    smov x1, v1.h[5]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    asr w3, w16, #2
+; CHECK-SD-NEXT:    sub w0, w0, w0, lsl #3
+; CHECK-SD-NEXT:    mov v2.h[4], w10
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    add w16, w3, w16, lsr #31
+; CHECK-SD-NEXT:    add w15, w15, w0
+; CHECK-SD-NEXT:    add w18, w18, w2
+; CHECK-SD-NEXT:    smull x1, w1, w8
+; CHECK-SD-NEXT:    smov x3, v1.h[6]
+; CHECK-SD-NEXT:    asr w0, w18, #2
+; CHECK-SD-NEXT:    mov v3.h[2], w15
+; CHECK-SD-NEXT:    sub w15, w16, w16, lsl #3
+; CHECK-SD-NEXT:    smov w16, v1.h[5]
+; CHECK-SD-NEXT:    add w10, w0, w18, lsr #31
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    add w11, w11, w12
+; CHECK-SD-NEXT:    lsr x18, x1, #32
+; CHECK-SD-NEXT:    add w15, w7, w15
+; CHECK-SD-NEXT:    smov x1, v1.h[7]
+; CHECK-SD-NEXT:    smull x0, w3, w8
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    smov w13, v0.h[7]
+; CHECK-SD-NEXT:    mov v3.h[3], w15
+; CHECK-SD-NEXT:    add w15, w18, w16
+; CHECK-SD-NEXT:    smov w18, v1.h[6]
+; CHECK-SD-NEXT:    asr w17, w15, #2
+; CHECK-SD-NEXT:    add w10, w2, w10
+; CHECK-SD-NEXT:    mov v2.h[5], w9
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    smull x8, w1, w8
+; CHECK-SD-NEXT:    lsr x9, x14, #32
+; CHECK-SD-NEXT:    add w15, w17, w15, lsr #31
+; CHECK-SD-NEXT:    asr w17, w11, #2
+; CHECK-SD-NEXT:    mov v3.h[4], w10
+; CHECK-SD-NEXT:    add w10, w0, w18
+; CHECK-SD-NEXT:    smov w0, v1.h[7]
+; CHECK-SD-NEXT:    add w11, w17, w11, lsr #31
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    asr w17, w10, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w9, w9, w13
+; CHECK-SD-NEXT:    add w14, w16, w15
+; CHECK-SD-NEXT:    add w10, w17, w10, lsr #31
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[5], w14
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    asr w14, w9, #2
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    asr w15, w8, #2
+; CHECK-SD-NEXT:    add w11, w12, w11
+; CHECK-SD-NEXT:    add w9, w14, w9, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    add w10, w18, w10
+; CHECK-SD-NEXT:    add w8, w15, w8, lsr #31
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w13, w9
+; CHECK-SD-NEXT:    add w8, w0, w8
+; CHECK-SD-NEXT:    mov v2.h[7], w9
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    movi v16.4h, #7
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w17, s3
+; CHECK-GI-NEXT:    mov w18, v3.s[1]
+; CHECK-GI-NEXT:    mov w0, v3.s[2]
+; CHECK-GI-NEXT:    mov w1, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[2]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i16> %s
+}
+
+; Signed remainder of <16 x i16> by a splat of 100; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the expected code is fully scalarised and
+; contains no division instruction. Each lane is moved to a general-purpose
+; register with smov, multiplied with smull by the constant built in w8 by
+; mov/movk, reduced with asr #37 and "add ..., lsr #31", turned into a
+; remainder with msub against 100 (held in w9), and inserted into v2/v3, which
+; are copied to v0/v1 at the end.
+; GlobalISel run (-global-isel=1): the lanes are sign-extended to 32 bits and
+; each one goes through a scalar sdiv by 100, followed by vector mls and uzp1.
+define <16 x i16> @sv16i16_100(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: sv16i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    smov x10, v0.h[0]
+; CHECK-SD-NEXT:    smov x11, v0.h[2]
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    smov x14, v1.h[0]
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    smov x9, v0.h[1]
+; CHECK-SD-NEXT:    smov x13, v1.h[1]
+; CHECK-SD-NEXT:    smov x17, v1.h[2]
+; CHECK-SD-NEXT:    smov w16, v0.h[0]
+; CHECK-SD-NEXT:    smov w18, v0.h[2]
+; CHECK-SD-NEXT:    smov x0, v0.h[3]
+; CHECK-SD-NEXT:    smov w15, v0.h[1]
+; CHECK-SD-NEXT:    smull x10, w10, w8
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smull x14, w14, w8
+; CHECK-SD-NEXT:    smull x12, w9, w8
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    smull x13, w13, w8
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    asr x14, x14, #37
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    smull x0, w0, w8
+; CHECK-SD-NEXT:    asr x13, x13, #37
+; CHECK-SD-NEXT:    msub w10, w10, w9, w16
+; CHECK-SD-NEXT:    add w14, w14, w14, lsr #31
+; CHECK-SD-NEXT:    msub w11, w11, w9, w18
+; CHECK-SD-NEXT:    smov w18, v1.h[0]
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smov w16, v1.h[1]
+; CHECK-SD-NEXT:    add w13, w13, w13, lsr #31
+; CHECK-SD-NEXT:    asr x17, x17, #37
+; CHECK-SD-NEXT:    msub w14, w14, w9, w18
+; CHECK-SD-NEXT:    smov x18, v1.h[3]
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    msub w12, w12, w9, w15
+; CHECK-SD-NEXT:    smov x15, v0.h[4]
+; CHECK-SD-NEXT:    smov w10, v1.h[2]
+; CHECK-SD-NEXT:    msub w13, w13, w9, w16
+; CHECK-SD-NEXT:    smov x16, v0.h[5]
+; CHECK-SD-NEXT:    add w17, w17, w17, lsr #31
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    asr x0, x0, #37
+; CHECK-SD-NEXT:    smov w14, v0.h[4]
+; CHECK-SD-NEXT:    mov v2.h[1], w12
+; CHECK-SD-NEXT:    msub w10, w17, w9, w10
+; CHECK-SD-NEXT:    smov x17, v1.h[4]
+; CHECK-SD-NEXT:    smull x18, w18, w8
+; CHECK-SD-NEXT:    smov w12, v0.h[3]
+; CHECK-SD-NEXT:    add w0, w0, w0, lsr #31
+; CHECK-SD-NEXT:    smull x15, w15, w8
+; CHECK-SD-NEXT:    mov v3.h[1], w13
+; CHECK-SD-NEXT:    smov x13, v1.h[5]
+; CHECK-SD-NEXT:    smull x16, w16, w8
+; CHECK-SD-NEXT:    asr x18, x18, #37
+; CHECK-SD-NEXT:    mov v2.h[2], w11
+; CHECK-SD-NEXT:    smull x17, w17, w8
+; CHECK-SD-NEXT:    asr x15, x15, #37
+; CHECK-SD-NEXT:    msub w12, w0, w9, w12
+; CHECK-SD-NEXT:    smov w0, v1.h[3]
+; CHECK-SD-NEXT:    asr x16, x16, #37
+; CHECK-SD-NEXT:    add w18, w18, w18, lsr #31
+; CHECK-SD-NEXT:    mov v3.h[2], w10
+; CHECK-SD-NEXT:    add w15, w15, w15, lsr #31
+; CHECK-SD-NEXT:    smull x10, w13, w8
+; CHECK-SD-NEXT:    asr x17, x17, #37
+; CHECK-SD-NEXT:    add w13, w16, w16, lsr #31
+; CHECK-SD-NEXT:    msub w16, w18, w9, w0
+; CHECK-SD-NEXT:    smov x11, v0.h[6]
+; CHECK-SD-NEXT:    mov v2.h[3], w12
+; CHECK-SD-NEXT:    smov x12, v1.h[6]
+; CHECK-SD-NEXT:    msub w14, w15, w9, w14
+; CHECK-SD-NEXT:    add w15, w17, w17, lsr #31
+; CHECK-SD-NEXT:    smov w17, v1.h[4]
+; CHECK-SD-NEXT:    asr x10, x10, #37
+; CHECK-SD-NEXT:    mov v3.h[3], w16
+; CHECK-SD-NEXT:    smov w18, v0.h[5]
+; CHECK-SD-NEXT:    smov x16, v0.h[7]
+; CHECK-SD-NEXT:    msub w15, w15, w9, w17
+; CHECK-SD-NEXT:    smov x0, v1.h[7]
+; CHECK-SD-NEXT:    add w10, w10, w10, lsr #31
+; CHECK-SD-NEXT:    smull x11, w11, w8
+; CHECK-SD-NEXT:    smov w17, v1.h[5]
+; CHECK-SD-NEXT:    mov v2.h[4], w14
+; CHECK-SD-NEXT:    smull x12, w12, w8
+; CHECK-SD-NEXT:    msub w13, w13, w9, w18
+; CHECK-SD-NEXT:    mov v3.h[4], w15
+; CHECK-SD-NEXT:    smov w15, v0.h[6]
+; CHECK-SD-NEXT:    msub w10, w10, w9, w17
+; CHECK-SD-NEXT:    asr x11, x11, #37
+; CHECK-SD-NEXT:    asr x12, x12, #37
+; CHECK-SD-NEXT:    smull x14, w16, w8
+; CHECK-SD-NEXT:    smull x8, w0, w8
+; CHECK-SD-NEXT:    add w11, w11, w11, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[5], w13
+; CHECK-SD-NEXT:    add w12, w12, w12, lsr #31
+; CHECK-SD-NEXT:    smov w13, v1.h[6]
+; CHECK-SD-NEXT:    mov v3.h[5], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w15
+; CHECK-SD-NEXT:    asr x14, x14, #37
+; CHECK-SD-NEXT:    msub w10, w12, w9, w13
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    smov w12, v0.h[7]
+; CHECK-SD-NEXT:    add w13, w14, w14, lsr #31
+; CHECK-SD-NEXT:    smov w14, v1.h[7]
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    msub w11, w13, w9, w12
+; CHECK-SD-NEXT:    msub w8, w8, w9, w14
+; CHECK-SD-NEXT:    mov v2.h[7], w11
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv16i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    movi v16.4h, #100
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w17, s3
+; CHECK-GI-NEXT:    mov w18, v3.s[1]
+; CHECK-GI-NEXT:    mov w0, v3.s[2]
+; CHECK-GI-NEXT:    mov w1, v3.s[3]
+; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sdiv w11, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[1]
+; CHECK-GI-NEXT:    fmov w2, s3
+; CHECK-GI-NEXT:    mov w3, v3.s[1]
+; CHECK-GI-NEXT:    mov w4, v3.s[2]
+; CHECK-GI-NEXT:    mov w5, v3.s[3]
+; CHECK-GI-NEXT:    sdiv w12, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[2]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    mov w9, v2.s[3]
+; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mov v4.s[1], w12
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    fmov w13, s2
+; CHECK-GI-NEXT:    mov w14, v2.s[1]
+; CHECK-GI-NEXT:    mov w15, v2.s[2]
+; CHECK-GI-NEXT:    mov w16, v2.s[3]
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    mov v4.s[2], w10
+; CHECK-GI-NEXT:    sdiv w17, w17, w8
+; CHECK-GI-NEXT:    fmov s5, w13
+; CHECK-GI-NEXT:    sdiv w2, w2, w8
+; CHECK-GI-NEXT:    fmov s6, w17
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    fmov s7, w2
+; CHECK-GI-NEXT:    sdiv w18, w18, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w14
+; CHECK-GI-NEXT:    sdiv w3, w3, w8
+; CHECK-GI-NEXT:    mov v6.s[1], w18
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v7.s[1], w3
+; CHECK-GI-NEXT:    sdiv w0, w0, w8
+; CHECK-GI-NEXT:    mov v5.s[2], w15
+; CHECK-GI-NEXT:    sdiv w4, w4, w8
+; CHECK-GI-NEXT:    mov v6.s[2], w0
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    mov v7.s[2], w4
+; CHECK-GI-NEXT:    sdiv w16, w16, w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w1, w1, w8
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mls v2.4s, v5.4s, v16.4s
+; CHECK-GI-NEXT:    sdiv w8, w5, w8
+; CHECK-GI-NEXT:    mov v6.s[3], w1
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w8
+; CHECK-GI-NEXT:    mls v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <16 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <16 x i16> %s
+}
+
+; Unsigned remainder of <2 x i16> by a splat of 7; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the lanes are masked to their low 16 bits
+; (movi/and), multiplied with umull by the constant built in w8, narrowed with
+; shrn #32 and folded back with mls against 7.
+; GlobalISel run (-global-isel=1): no udiv is expected. The sequence is a mul by
+; 9363, ushr #16, a sub from the input, two ushl steps whose shift amounts are
+; the negated constants 1 and 2, and a final mls against 7.
+define <2 x i16> @uv2i16_7(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushr v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    ushl v2.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    usra v2.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v2.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 7, i16 7>
+  ret <2 x i16> %s
+}
+
+; Unsigned remainder of <2 x i16> by a splat of 100; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the lanes are masked to their low 16 bits
+; (movi/and), multiplied with umull by the constant built in w8, narrowed with
+; shrn #32 and folded back with mls against 100.
+; GlobalISel run (-global-isel=1): no udiv is expected. The input is first
+; shifted with ushl by the negated constant 2, multiplied by 5243, reduced with
+; ushr #16 and a ushl by the negated constant 1, then folded back with mls
+; against 100.
+define <2 x i16> @uv2i16_100(<2 x i16> %d, <2 x i16> %e) {
+; CHECK-SD-LABEL: uv2i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    dup v2.2s, w8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    ushl v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    dup v2.2s, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i16> %d, <i16 100, i16 100>
+  ret <2 x i16> %s
+}
+
+; Unsigned remainder of <3 x i16> by a splat of 7; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the three lanes are handled in scalar
+; registers (umov, umull by the constant built in w8, lsr #32, then
+; "sub wN, wN, wN, lsl #3" plus an add of the original lane) and reinserted.
+; GlobalISel run (-global-isel=1): no udiv is expected. The lanes are rebuilt as
+; a 32-bit vector, multiplied by 9363, shifted by the negated constant 16 and
+; narrowed with xtn; a sub/ushl/add/ushl chain (shift amounts are the negated
+; constants 1 and 2) follows, and the result is folded back with mls against 7.
+define <3 x i16> @uv3i16_7(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    sub w11, w11, w11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w11
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    add w10, w10, w13
+; CHECK-SD-NEXT:    add w8, w12, w8
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    umov w9, v0.h[0]
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    umov w10, v0.h[1]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    umov w11, v0.h[2]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #16 // =0x10
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sub v4.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    neg v3.4h, v3.4h
+; CHECK-GI-NEXT:    ushl v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    add v1.4h, v3.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v4.4h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 7, i16 7, i16 7>
+  ret <3 x i16> %s
+}
+
+; Unsigned remainder of <3 x i16> by a splat of 100; the second argument %e is unused.
+; SelectionDAG run (-global-isel=0): the three lanes are handled in scalar
+; registers (umov, umull by the constant built in w8, lsr #32, msub against
+; 100 held in w14) and reinserted.
+; GlobalISel run (-global-isel=1): no udiv is expected. The input is shifted
+; with ushl by the negated constant 2, the lanes are rebuilt as a 32-bit vector
+; and multiplied by 5243, shifted by the negated constant 16, narrowed with xtn,
+; shifted by the negated constant 1 and folded back with mls against 100.
+define <3 x i16> @uv3i16_100(<3 x i16> %d, <3 x i16> %e) {
+; CHECK-SD-LABEL: uv3i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w9, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    umull x11, w9, w8
+; CHECK-SD-NEXT:    umull x13, w10, w8
+; CHECK-SD-NEXT:    umull x8, w12, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    msub w9, w11, w14, w9
+; CHECK-SD-NEXT:    lsr x11, x13, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    msub w10, w11, w14, w10
+; CHECK-SD-NEXT:    fmov s0, w9
+; CHECK-SD-NEXT:    msub w8, w8, w14, w12
+; CHECK-SD-NEXT:    mov v0.h[1], w10
+; CHECK-SD-NEXT:    mov v0.h[2], w8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov w11, #5243 // =0x147b
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w11
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    mov v1.h[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    neg v1.4h, v1.4h
+; CHECK-GI-NEXT:    ushl v1.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-NEXT:    umov w9, v1.h[1]
+; CHECK-GI-NEXT:    umov w10, v1.h[2]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov w8, #16 // =0x10
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w9
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w10
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.h[1], w8
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v4.h[2], w8
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    neg v3.4h, v4.4h
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    ushl v1.4h, v1.4h, v3.4h
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i16> %d, <i16 100, i16 100, i16 100>
+  ret <3 x i16> %s
+}
+
+; Unsigned remainder of <4 x i16> by a splat of 7; the second argument %e is unused.
+; Neither run is expected to emit a division instruction.
+; SelectionDAG run (-global-isel=0): the multiplier 9363 is built with mov/dup;
+; umull and shrn #16 are followed by sub, a ushll/shrn #1 pair, add, ushr #2
+; and mls against 7.
+; GlobalISel run (-global-isel=1): the multiplier vector is loaded through
+; adrp/ldr from .LCPI58_0; umull and shrn #16 are followed by sub, usra #1,
+; ushr #2 and mls against 7.
+define <4 x i16> @uv4i16_7(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-SD-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    shrn v2.4h, v2.4s, #1
+; CHECK-SD-NEXT:    add v1.4h, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #7
+; CHECK-SD-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI58_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    usra v1.4h, v2.4h, #1
+; CHECK-GI-NEXT:    movi v2.4h, #7
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #2
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
+  ret <4 x i16> %s
+}
+
+; Unsigned remainder of <4 x i16> by a splat of 100; the second argument %e is unused.
+; Neither run is expected to emit a division instruction; both start with
+; ushr #2 on the input and end with mls against 100.
+; SelectionDAG run (-global-isel=0): the multiplier 5243 is built with mov/dup,
+; and the umull result is reduced with ushr #17 and xtn.
+; GlobalISel run (-global-isel=1): the multiplier vector is loaded through
+; adrp/ldr from .LCPI59_0, and the umull result is reduced with shrn #16 and
+; ushr #1.
+define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) {
+; CHECK-SD-LABEL: uv4i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.4h, v0.4h, #2
+; CHECK-SD-NEXT:    dup v1.4h, w8
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.4h, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #17
+; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI59_0
+; CHECK-GI-NEXT:    ushr v1.4h, v0.4h, #2
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.4h, #100
+; CHECK-GI-NEXT:    shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT:    ushr v1.4h, v1.4h, #1
+; CHECK-GI-NEXT:    mls v0.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
+  ret <4 x i16> %s
+}
+
+; Unsigned remainder of <8 x i16> by a splat of 7; the second argument %e is unused.
+; Neither run is expected to emit a division instruction. Both share the same
+; umull2/umull, uzp2, sub, usra #1, ushr #2 and mls (against 7) sequence.
+; The only difference is how the multiplier vector is obtained: the
+; SelectionDAG run (-global-isel=0) builds 9363 with mov/dup, while the
+; GlobalISel run (-global-isel=1) loads it through adrp/ldr from .LCPI60_0.
+define <8 x i16> @uv8i16_7(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-SD-NEXT:    movi v2.8h, #7
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI60_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    sub v2.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usra v1.8h, v2.8h, #1
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %s
+}
+
+; urem <8 x i16> by splat 100. Both selectors are expected to pre-shift by 2,
+; take the high half of the widened product (umull/umull2 + uzp2), shift right
+; by 1 and apply mls. Only the multiplier materialization differs (mov/dup of
+; 0x147b for SelectionDAG, a constant-pool load for GlobalISel).
+define <8 x i16> @uv8i16_100(<8 x i16> %d, <8 x i16> %e) {
+; CHECK-SD-LABEL: uv8i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #5243 // =0x147b
+; CHECK-SD-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-SD-NEXT:    dup v1.8h, w8
+; CHECK-SD-NEXT:    umull2 v3.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT:    umull v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT:    movi v2.8h, #100
+; CHECK-SD-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-SD-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI61_0
+; CHECK-GI-NEXT:    ushr v1.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT:    umull2 v3.4s, v1.8h, v2.8h
+; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    movi v2.8h, #100
+; CHECK-GI-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushr v1.8h, v1.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %s
+}
+
+; urem <16 x i16> by splat 7, with the value spanning two q registers (v0, v1).
+; The SelectionDAG expectation extracts every lane with umov and computes the
+; remainder with scalar umull / lsr #32 / sub / add before re-inserting lanes.
+; The GlobalISel expectation stays in vector registers: umull on 4h pieces,
+; shrn/shrn2 to narrow, sub/usra/ushr, then one 8h mls per register.
+define <16 x i16> @uv16i16_7(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: uv16i16_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w9, v0.h[1]
+; CHECK-SD-NEXT:    umov w10, v0.h[0]
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    umov w12, v1.h[0]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    umov w11, v1.h[1]
+; CHECK-SD-NEXT:    umov w17, v0.h[2]
+; CHECK-SD-NEXT:    umov w18, v1.h[2]
+; CHECK-SD-NEXT:    umov w0, v0.h[3]
+; CHECK-SD-NEXT:    umov w1, v1.h[3]
+; CHECK-SD-NEXT:    umull x13, w9, w8
+; CHECK-SD-NEXT:    umull x14, w10, w8
+; CHECK-SD-NEXT:    umull x16, w12, w8
+; CHECK-SD-NEXT:    umull x15, w11, w8
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    sub w16, w16, w16, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w13
+; CHECK-SD-NEXT:    umull x13, w17, w8
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    umull x14, w18, w8
+; CHECK-SD-NEXT:    sub w15, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w12, w12, w16
+; CHECK-SD-NEXT:    fmov s2, w10
+; CHECK-SD-NEXT:    umov w16, v1.h[4]
+; CHECK-SD-NEXT:    fmov s3, w12
+; CHECK-SD-NEXT:    add w11, w11, w15
+; CHECK-SD-NEXT:    lsr x13, x13, #32
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    umov w15, v0.h[4]
+; CHECK-SD-NEXT:    umull x10, w0, w8
+; CHECK-SD-NEXT:    umull x12, w1, w8
+; CHECK-SD-NEXT:    mov v2.h[1], w9
+; CHECK-SD-NEXT:    sub w13, w13, w13, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[1], w11
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    umov w9, v0.h[5]
+; CHECK-SD-NEXT:    add w13, w17, w13
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    umov w11, v1.h[5]
+; CHECK-SD-NEXT:    add w14, w18, w14
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    umull x17, w15, w8
+; CHECK-SD-NEXT:    umull x18, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[2], w13
+; CHECK-SD-NEXT:    sub w10, w10, w10, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[2], w14
+; CHECK-SD-NEXT:    sub w12, w12, w12, lsl #3
+; CHECK-SD-NEXT:    umov w13, v0.h[6]
+; CHECK-SD-NEXT:    lsr x14, x17, #32
+; CHECK-SD-NEXT:    add w10, w0, w10
+; CHECK-SD-NEXT:    umull x17, w9, w8
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    add w12, w1, w12
+; CHECK-SD-NEXT:    umull x0, w11, w8
+; CHECK-SD-NEXT:    mov v2.h[3], w10
+; CHECK-SD-NEXT:    umov w10, v1.h[6]
+; CHECK-SD-NEXT:    sub w14, w14, w14, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[3], w12
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    add w14, w15, w14
+; CHECK-SD-NEXT:    umov w12, v0.h[7]
+; CHECK-SD-NEXT:    add w15, w16, w18
+; CHECK-SD-NEXT:    lsr x18, x0, #32
+; CHECK-SD-NEXT:    umov w16, v1.h[7]
+; CHECK-SD-NEXT:    mov v2.h[4], w14
+; CHECK-SD-NEXT:    umull x14, w13, w8
+; CHECK-SD-NEXT:    sub w17, w17, w17, lsl #3
+; CHECK-SD-NEXT:    mov v3.h[4], w15
+; CHECK-SD-NEXT:    umull x15, w10, w8
+; CHECK-SD-NEXT:    sub w18, w18, w18, lsl #3
+; CHECK-SD-NEXT:    add w9, w9, w17
+; CHECK-SD-NEXT:    add w11, w11, w18
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    mov v2.h[5], w9
+; CHECK-SD-NEXT:    umull x9, w12, w8
+; CHECK-SD-NEXT:    mov v3.h[5], w11
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    sub w11, w14, w14, lsl #3
+; CHECK-SD-NEXT:    sub w14, w15, w15, lsl #3
+; CHECK-SD-NEXT:    add w11, w13, w11
+; CHECK-SD-NEXT:    lsr x9, x9, #32
+; CHECK-SD-NEXT:    add w10, w10, w14
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.h[6], w11
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    add w9, w12, w9
+; CHECK-SD-NEXT:    add w8, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[7], w9
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv16i16_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI62_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT:    umull v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    umull v6.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umull v3.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    umull v2.4s, v5.4h, v2.4h
+; CHECK-GI-NEXT:    shrn v4.4h, v4.4s, #16
+; CHECK-GI-NEXT:    shrn v5.4h, v6.4s, #16
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.8h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v5.8h, v2.4s, #16
+; CHECK-GI-NEXT:    shrn2 v6.8h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v7.8h, v2.4s, #16
+; CHECK-GI-NEXT:    movi v2.8h, #7
+; CHECK-GI-NEXT:    sub v6.8h, v0.8h, v6.8h
+; CHECK-GI-NEXT:    sub v7.8h, v1.8h, v7.8h
+; CHECK-GI-NEXT:    usra v4.8h, v6.8h, #1
+; CHECK-GI-NEXT:    usra v5.8h, v7.8h, #1
+; CHECK-GI-NEXT:    ushr v3.8h, v4.8h, #2
+; CHECK-GI-NEXT:    ushr v4.8h, v5.8h, #2
+; CHECK-GI-NEXT:    mls v0.8h, v3.8h, v2.8h
+; CHECK-GI-NEXT:    mls v1.8h, v4.8h, v2.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <16 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <16 x i16> %s
+}
+
+; urem <16 x i16> by splat 100, with the value spanning two q registers.
+; The SelectionDAG expectation works lane by lane: umov, scalar umull by a
+; mov/movk-built constant, lsr #32, msub, then lane re-insertion. The
+; GlobalISel expectation stays in vector registers: ushr #2 pre-shift, umull on
+; 4h pieces, shrn/shrn2, ushr #1, then one 8h mls per register.
+define <16 x i16> @uv16i16_100(<16 x i16> %d, <16 x i16> %e) {
+; CHECK-SD-LABEL: uv16i16_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umov w11, v0.h[0]
+; CHECK-SD-NEXT:    umov w14, v1.h[0]
+; CHECK-SD-NEXT:    mov w8, #23593 // =0x5c29
+; CHECK-SD-NEXT:    umov w12, v0.h[2]
+; CHECK-SD-NEXT:    umov w10, v0.h[1]
+; CHECK-SD-NEXT:    movk w8, #655, lsl #16
+; CHECK-SD-NEXT:    umov w13, v1.h[1]
+; CHECK-SD-NEXT:    umov w0, v1.h[2]
+; CHECK-SD-NEXT:    mov w9, #100 // =0x64
+; CHECK-SD-NEXT:    umull x16, w11, w8
+; CHECK-SD-NEXT:    umull x1, w14, w8
+; CHECK-SD-NEXT:    umull x17, w12, w8
+; CHECK-SD-NEXT:    umull x15, w10, w8
+; CHECK-SD-NEXT:    lsr x16, x16, #32
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x1, x1, #32
+; CHECK-SD-NEXT:    lsr x17, x17, #32
+; CHECK-SD-NEXT:    msub w11, w16, w9, w11
+; CHECK-SD-NEXT:    lsr x15, x15, #32
+; CHECK-SD-NEXT:    msub w14, w1, w9, w14
+; CHECK-SD-NEXT:    lsr x16, x18, #32
+; CHECK-SD-NEXT:    msub w12, w17, w9, w12
+; CHECK-SD-NEXT:    umov w17, v1.h[3]
+; CHECK-SD-NEXT:    msub w10, w15, w9, w10
+; CHECK-SD-NEXT:    umov w15, v0.h[3]
+; CHECK-SD-NEXT:    fmov s2, w11
+; CHECK-SD-NEXT:    umull x18, w0, w8
+; CHECK-SD-NEXT:    fmov s3, w14
+; CHECK-SD-NEXT:    msub w13, w16, w9, w13
+; CHECK-SD-NEXT:    umov w16, v0.h[4]
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    mov v2.h[1], w10
+; CHECK-SD-NEXT:    umov w10, v1.h[4]
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umull x1, w15, w8
+; CHECK-SD-NEXT:    mov v3.h[1], w13
+; CHECK-SD-NEXT:    umov w13, v0.h[5]
+; CHECK-SD-NEXT:    msub w18, w18, w9, w0
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    lsr x14, x1, #32
+; CHECK-SD-NEXT:    umull x0, w16, w8
+; CHECK-SD-NEXT:    mov v2.h[2], w12
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    umov w17, v1.h[5]
+; CHECK-SD-NEXT:    umull x12, w10, w8
+; CHECK-SD-NEXT:    mov v3.h[2], w18
+; CHECK-SD-NEXT:    msub w14, w14, w9, w15
+; CHECK-SD-NEXT:    lsr x0, x0, #32
+; CHECK-SD-NEXT:    umov w15, v0.h[6]
+; CHECK-SD-NEXT:    umull x18, w13, w8
+; CHECK-SD-NEXT:    lsr x12, x12, #32
+; CHECK-SD-NEXT:    msub w16, w0, w9, w16
+; CHECK-SD-NEXT:    umov w0, v1.h[6]
+; CHECK-SD-NEXT:    mov v2.h[3], w14
+; CHECK-SD-NEXT:    mov v3.h[3], w11
+; CHECK-SD-NEXT:    umull x11, w17, w8
+; CHECK-SD-NEXT:    msub w10, w12, w9, w10
+; CHECK-SD-NEXT:    lsr x18, x18, #32
+; CHECK-SD-NEXT:    umov w12, v0.h[7]
+; CHECK-SD-NEXT:    umull x14, w15, w8
+; CHECK-SD-NEXT:    lsr x11, x11, #32
+; CHECK-SD-NEXT:    msub w13, w18, w9, w13
+; CHECK-SD-NEXT:    mov v2.h[4], w16
+; CHECK-SD-NEXT:    mov v3.h[4], w10
+; CHECK-SD-NEXT:    umull x10, w0, w8
+; CHECK-SD-NEXT:    umov w16, v1.h[7]
+; CHECK-SD-NEXT:    msub w11, w11, w9, w17
+; CHECK-SD-NEXT:    lsr x14, x14, #32
+; CHECK-SD-NEXT:    lsr x10, x10, #32
+; CHECK-SD-NEXT:    msub w14, w14, w9, w15
+; CHECK-SD-NEXT:    umull x15, w12, w8
+; CHECK-SD-NEXT:    mov v2.h[5], w13
+; CHECK-SD-NEXT:    mov v3.h[5], w11
+; CHECK-SD-NEXT:    umull x8, w16, w8
+; CHECK-SD-NEXT:    msub w10, w10, w9, w0
+; CHECK-SD-NEXT:    lsr x11, x15, #32
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    mov v2.h[6], w14
+; CHECK-SD-NEXT:    mov v3.h[6], w10
+; CHECK-SD-NEXT:    msub w11, w11, w9, w12
+; CHECK-SD-NEXT:    msub w8, w8, w9, w16
+; CHECK-SD-NEXT:    mov v2.h[7], w11
+; CHECK-SD-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv16i16_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v2.8h, v0.8h, #2
+; CHECK-GI-NEXT:    ushr v3.8h, v1.8h, #2
+; CHECK-GI-NEXT:    adrp x8, .LCPI63_0
+; CHECK-GI-NEXT:    ldr d4, [x8, :lo12:.LCPI63_0]
+; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    mov d6, v3.d[1]
+; CHECK-GI-NEXT:    umull v2.4s, v2.4h, v4.4h
+; CHECK-GI-NEXT:    umull v3.4s, v3.4h, v4.4h
+; CHECK-GI-NEXT:    umull v5.4s, v5.4h, v4.4h
+; CHECK-GI-NEXT:    umull v4.4s, v6.4h, v4.4h
+; CHECK-GI-NEXT:    shrn v2.4h, v2.4s, #16
+; CHECK-GI-NEXT:    shrn v3.4h, v3.4s, #16
+; CHECK-GI-NEXT:    shrn2 v2.8h, v5.4s, #16
+; CHECK-GI-NEXT:    shrn2 v3.8h, v4.4s, #16
+; CHECK-GI-NEXT:    movi v4.8h, #100
+; CHECK-GI-NEXT:    ushr v2.8h, v2.8h, #1
+; CHECK-GI-NEXT:    ushr v3.8h, v3.8h, #1
+; CHECK-GI-NEXT:    mls v0.8h, v2.8h, v4.8h
+; CHECK-GI-NEXT:    mls v1.8h, v3.8h, v4.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <16 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <16 x i16> %s
+}
+
+; srem <2 x i32> by splat 7 (signed). The SelectionDAG expectation is a
+; multiply-based sequence (smull, shrn #32, add, sshr/usra, mls) with no divide.
+; The GlobalISel expectation still contains one scalar sdiv per lane, followed
+; by a vector mls to form the remainder.
+define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+; srem <2 x i32> by splat 100 (signed). The SelectionDAG expectation is
+; multiply-based (smull, sshr #37, xtn, usra #31, mls) with no divide. The
+; GlobalISel expectation still contains one scalar sdiv per lane, followed by a
+; vector mls to form the remainder.
+define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: sv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w8, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+; srem <3 x i32> by splat 7 (signed, non-power-of-two lane count). The
+; SelectionDAG expectation handles lanes 0-1 with a 2s vector smull sequence
+; and lane 2 with the scalar equivalent, with no divide. The GlobalISel
+; expectation is fully scalar: one sdiv per lane, then lsl #3 / sub / sub to
+; form each remainder.
+define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v3.2s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    add w8, w8, w9
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    asr w10, w8, #2
+; CHECK-SD-NEXT:    add w8, w10, w8, lsr #31
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    sshr v2.2s, v1.2s, #2
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    usra v2.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    lsl w14, w10, #3
+; CHECK-GI-NEXT:    sub w10, w14, w10
+; CHECK-GI-NEXT:    sub w9, w9, w10
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w8, w13, w8
+; CHECK-GI-NEXT:    lsl w15, w12, #3
+; CHECK-GI-NEXT:    sub w10, w15, w12
+; CHECK-GI-NEXT:    sub w10, w11, w10
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    lsl w9, w8, #3
+; CHECK-GI-NEXT:    sub w8, w9, w8
+; CHECK-GI-NEXT:    sub w8, w13, w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+; srem <3 x i32> by splat 100 (signed, non-power-of-two lane count). The
+; SelectionDAG expectation handles lanes 0-1 with a 2s vector smull sequence
+; and lane 2 with scalar smull/asr/msub, with no divide. The GlobalISel
+; expectation is fully scalar: one sdiv plus msub per lane.
+define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: sv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    smull x8, w9, w8
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    asr x8, x8, #37
+; CHECK-SD-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    usra v1.2s, v1.2s, #31
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    sdiv w10, w9, w8
+; CHECK-GI-NEXT:    fmov w11, s1
+; CHECK-GI-NEXT:    fmov w13, s0
+; CHECK-GI-NEXT:    sdiv w12, w11, w8
+; CHECK-GI-NEXT:    msub w9, w10, w8, w9
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    sdiv w14, w13, w8
+; CHECK-GI-NEXT:    msub w10, w12, w8, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w10
+; CHECK-GI-NEXT:    msub w8, w14, w8, w13
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+; srem <4 x i32> by splat 7 (signed). The SelectionDAG expectation is
+; multiply-based (smull/smull2, uzp2, add, sshr/usra, mls) with no divide. The
+; GlobalISel expectation extracts each lane, issues four scalar sdiv
+; instructions, rebuilds the quotient vector and applies mls.
+define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v3.4s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #2
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+; srem <4 x i32> by splat 100 (signed). The SelectionDAG expectation is
+; multiply-based (smull/smull2, uzp2, sshr #5, usra #31, mls) with no divide.
+; The GlobalISel expectation extracts each lane, issues four scalar sdiv
+; instructions, rebuilds the quotient vector and applies mls.
+define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: sv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v3.4s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    smull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sshr v2.4s, v1.4s, #5
+; CHECK-SD-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    sdiv w8, w12, w8
+; CHECK-GI-NEXT:    mov v1.s[2], w11
+; CHECK-GI-NEXT:    mov v1.s[3], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+; srem <8 x i32> by splat 7 (signed), with the value spanning two q registers.
+; The SelectionDAG expectation repeats the multiply-based 4s sequence for v0
+; and v1 with no divide. The GlobalISel expectation extracts all eight lanes,
+; issues eight scalar sdiv instructions and finishes with one mls per register.
+define <8 x i32> @sv8i32_7(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: sv8i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-SD-NEXT:    movi v6.4s, #7
+; CHECK-SD-NEXT:    movk w8, #37449, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    smull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    smull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    smull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    add v3.4s, v3.4s, v0.4s
+; CHECK-SD-NEXT:    add v2.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT:    sshr v4.4s, v3.4s, #2
+; CHECK-SD-NEXT:    sshr v5.4s, v2.4s, #2
+; CHECK-SD-NEXT:    usra v4.4s, v3.4s, #31
+; CHECK-SD-NEXT:    usra v5.4s, v2.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v4.4s, v6.4s
+; CHECK-SD-NEXT:    mls v1.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    fmov w13, s1
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w14, v1.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w15, v1.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov w16, v1.s[3]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    movi v4.4s, #7
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i32> %d, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i32> %s
+}
+
+; srem <8 x i32> by splat 100 (signed), with the value spanning two q
+; registers. The SelectionDAG expectation repeats the multiply-based 4s
+; sequence for v0 and v1 with no divide. The GlobalISel expectation extracts
+; all eight lanes, issues eight scalar sdiv instructions and finishes with one
+; mls per register.
+define <8 x i32> @sv8i32_100(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: sv8i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v6.4s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    smull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    smull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    smull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    sshr v4.4s, v3.4s, #5
+; CHECK-SD-NEXT:    sshr v5.4s, v2.4s, #5
+; CHECK-SD-NEXT:    usra v4.4s, v3.4s, #31
+; CHECK-SD-NEXT:    usra v5.4s, v2.4s, #31
+; CHECK-SD-NEXT:    mls v0.4s, v4.4s, v6.4s
+; CHECK-SD-NEXT:    mls v1.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv8i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    fmov w13, s1
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov w10, v0.s[1]
+; CHECK-GI-NEXT:    mov w14, v1.s[1]
+; CHECK-GI-NEXT:    mov w11, v0.s[2]
+; CHECK-GI-NEXT:    mov w15, v1.s[2]
+; CHECK-GI-NEXT:    mov w12, v0.s[3]
+; CHECK-GI-NEXT:    mov w16, v1.s[3]
+; CHECK-GI-NEXT:    sdiv w9, w9, w8
+; CHECK-GI-NEXT:    movi v4.4s, #100
+; CHECK-GI-NEXT:    sdiv w13, w13, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    sdiv w10, w10, w8
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    sdiv w14, w14, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    sdiv w11, w11, w8
+; CHECK-GI-NEXT:    mov v3.s[1], w14
+; CHECK-GI-NEXT:    sdiv w15, w15, w8
+; CHECK-GI-NEXT:    mov v2.s[2], w11
+; CHECK-GI-NEXT:    sdiv w12, w12, w8
+; CHECK-GI-NEXT:    mov v3.s[2], w15
+; CHECK-GI-NEXT:    sdiv w8, w16, w8
+; CHECK-GI-NEXT:    mov v2.s[3], w12
+; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mls v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <8 x i32> %d, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+  ret <8 x i32> %s
+}
+
+; urem <2 x i32> by splat 7. Both selectors are expected to use umull +
+; shrn #32 for the high half and end with ushr #2 + mls. For the intermediate
+; halve-and-add step SelectionDAG is expected to emit ushll/shrn #1/add, while
+; GlobalISel uses a single usra #1. The multiplier comes from mov/movk/dup for
+; SelectionDAG and from a constant pool for GlobalISel.
+define <2 x i32> @uv2i32_7(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI72_0
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI72_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    usra v1.2s, v2.2s, #1
+; CHECK-GI-NEXT:    movi v2.2s, #7
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 7, i32 7>
+  ret <2 x i32> %s
+}
+
+; urem <2 x i32> by splat 100. Both selectors are expected to use umull and a
+; total right shift of 37 before mls. SelectionDAG shifts the 2d product once
+; (ushr #37) and narrows with xtn; GlobalISel narrows with shrn #32 and then
+; shifts the 2s result by 5. The multiplier comes from mov/movk/dup for
+; SelectionDAG and from a constant pool for GlobalISel.
+define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) {
+; CHECK-SD-LABEL: uv2i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI73_0
+; CHECK-GI-NEXT:    movi v2.2s, #100
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI73_0]
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT:    ushr v1.2s, v1.2s, #5
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i32> %d, <i32 100, i32 100>
+  ret <2 x i32> %s
+}
+
+; urem <3 x i32> by splat 7 (non-power-of-two lane count). The SelectionDAG
+; expectation handles lanes 0-1 with a 2s vector umull sequence and lane 2 with
+; the scalar equivalent. The GlobalISel expectation also multiplies lanes 0-1
+; as a vector and lane 2 as a scalar, but then rebuilds 4s vectors lane by lane
+; (products, shift amounts 1 and 2, divisor 7) and performs the shifts with
+; neg + ushl before the final 4s mls.
+define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #32
+; CHECK-SD-NEXT:    sub w10, w9, w8
+; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-SD-NEXT:    add w8, w8, w10, lsr #1
+; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    add w8, w9, w8
+; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
+; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    movi v2.2s, #7
+; CHECK-SD-NEXT:    ushr v1.2s, v1.2s, #2
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI74_0
+; CHECK-GI-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI74_0]
+; CHECK-GI-NEXT:    mov w8, v0.s[2]
+; CHECK-GI-NEXT:    movk w9, #9362, lsl #16
+; CHECK-GI-NEXT:    mov w10, #1 // =0x1
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x8, w8, w9
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov w9, #2 // =0x2
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov v1.s[1], w11
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v3.s[1], w9
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov v3.s[2], w9
+; CHECK-GI-NEXT:    sub v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v2.4s
+; CHECK-GI-NEXT:    ushl v2.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    neg v2.4s, v3.4s
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 7, i32 7, i32 7>
+  ret <3 x i32> %s
+}
+
+; urem <3 x i32> by splat 100 (non-power-of-two lane count). The SelectionDAG
+; expectation handles lanes 0-1 with a 2s vector umull sequence and lane 2 with
+; scalar umull/lsr/msub. The GlobalISel expectation also multiplies lanes 0-1
+; as a vector and lane 2 as a scalar, then rebuilds 4s vectors lane by lane
+; (products, shift amount 5, divisor 100) and shifts with neg + ushl before the
+; final 4s mls.
+define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) {
+; CHECK-SD-LABEL: uv3i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    movi v2.2s, #100
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    mov w10, #100 // =0x64
+; CHECK-SD-NEXT:    dup v1.2s, w8
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    lsr x8, x8, #37
+; CHECK-SD-NEXT:    msub w8, w8, w10, w9
+; CHECK-SD-NEXT:    ushr v1.2d, v1.2d, #37
+; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:    mov v0.s[2], w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI75_0
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI75_0]
+; CHECK-GI-NEXT:    mov w8, #5 // =0x5
+; CHECK-GI-NEXT:    mov w10, #34079 // =0x851f
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    movk w10, #20971, lsl #16
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    umull x9, w9, w10
+; CHECK-GI-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    lsr x8, x9, #32
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
+; CHECK-GI-NEXT:    neg v3.4s, v3.4s
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    fmov s1, w11
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    mov v2.s[1], w10
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w10
+; CHECK-GI-NEXT:    ushl v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i32> %d, <i32 100, i32 100, i32 100>
+  ret <3 x i32> %s
+}
+
+; urem <4 x i32> by splat 7. Both selectors are expected to emit the same
+; umull/umull2 + uzp2 + sub/usra/ushr + mls sequence; they differ only in how
+; the multiplier vector is produced (mov/movk/dup for SelectionDAG, a
+; constant-pool load for GlobalISel).
+define <4 x i32> @uv4i32_7(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT:    movi v2.4s, #7
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI76_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI76_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    sub v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    usra v1.4s, v2.4s, #1
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %s
+}
+
+; urem <4 x i32> by splat 100. Both outputs are divide-free: high halves of a
+; widening multiply (umull/umull2 + uzp2), ushr #5, then mls by 100.
+; SelectionDAG materializes the multiply operand with mov/movk/dup; GlobalISel
+; loads it from a constant-pool entry.
+define <4 x i32> @uv4i32_100(<4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: uv4i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    movi v2.4s, #100
+; CHECK-SD-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI77_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI77_0]
+; CHECK-GI-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    umull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v1.4s, v1.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %s
+}
+
+; urem <8 x i32> by splat 7, i.e. two 128-bit registers. Both outputs are
+; divide-free. SelectionDAG repeats the umull/umull2 + uzp2 sequence per
+; register. GlobalISel loads a 64-bit constant, issues four umull on 64-bit
+; halves and narrows with shrn/shrn2 #32; note that it builds each narrowed
+; vector twice (v4/v6 and v5/v7) before the sub/usra step.
+define <8 x i32> @uv8i32_7(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: uv8i32_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
+; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    umull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    umull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    sub v4.4s, v0.4s, v3.4s
+; CHECK-SD-NEXT:    sub v5.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    usra v3.4s, v4.4s, #1
+; CHECK-SD-NEXT:    movi v4.4s, #7
+; CHECK-SD-NEXT:    usra v2.4s, v5.4s, #1
+; CHECK-SD-NEXT:    ushr v3.4s, v3.4s, #2
+; CHECK-SD-NEXT:    ushr v2.4s, v2.4s, #2
+; CHECK-SD-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i32_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI78_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d5, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI78_0]
+; CHECK-GI-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    umull v6.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umull v3.2d, v3.2s, v2.2s
+; CHECK-GI-NEXT:    umull v2.2d, v5.2s, v2.2s
+; CHECK-GI-NEXT:    shrn v4.2s, v4.2d, #32
+; CHECK-GI-NEXT:    shrn v5.2s, v6.2d, #32
+; CHECK-GI-NEXT:    mov v6.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.16b, v5.16b
+; CHECK-GI-NEXT:    shrn2 v4.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v5.4s, v2.2d, #32
+; CHECK-GI-NEXT:    shrn2 v6.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v7.4s, v2.2d, #32
+; CHECK-GI-NEXT:    movi v2.4s, #7
+; CHECK-GI-NEXT:    sub v6.4s, v0.4s, v6.4s
+; CHECK-GI-NEXT:    sub v7.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    usra v4.4s, v6.4s, #1
+; CHECK-GI-NEXT:    usra v5.4s, v7.4s, #1
+; CHECK-GI-NEXT:    ushr v3.4s, v4.4s, #2
+; CHECK-GI-NEXT:    ushr v4.4s, v5.4s, #2
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i32> %d, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i32> %s
+}
+
+; urem <8 x i32> by splat 100. Both outputs are divide-free and end with
+; ushr #5 + mls by 100 on each register. SelectionDAG extracts the high halves
+; with umull/umull2 + uzp2; GlobalISel uses four umull on 64-bit halves and
+; narrows with shrn/shrn2 #32.
+define <8 x i32> @uv8i32_100(<8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: uv8i32_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-SD-NEXT:    movk w8, #20971, lsl #16
+; CHECK-SD-NEXT:    dup v2.4s, w8
+; CHECK-SD-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    umull v4.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    umull2 v5.2d, v1.4s, v2.4s
+; CHECK-SD-NEXT:    umull v2.2d, v1.2s, v2.2s
+; CHECK-SD-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    movi v4.4s, #100
+; CHECK-SD-NEXT:    uzp2 v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    ushr v3.4s, v3.4s, #5
+; CHECK-SD-NEXT:    ushr v2.4s, v2.4s, #5
+; CHECK-SD-NEXT:    mls v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    mls v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv8i32_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI79_0
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI79_0]
+; CHECK-GI-NEXT:    umull v5.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    umull v6.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umull v3.2d, v3.2s, v2.2s
+; CHECK-GI-NEXT:    umull v2.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    shrn v4.2s, v5.2d, #32
+; CHECK-GI-NEXT:    shrn v5.2s, v6.2d, #32
+; CHECK-GI-NEXT:    shrn2 v4.4s, v3.2d, #32
+; CHECK-GI-NEXT:    shrn2 v5.4s, v2.2d, #32
+; CHECK-GI-NEXT:    movi v2.4s, #100
+; CHECK-GI-NEXT:    ushr v3.4s, v4.4s, #5
+; CHECK-GI-NEXT:    ushr v4.4s, v5.4s, #5
+; CHECK-GI-NEXT:    mls v0.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mls v1.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <8 x i32> %d, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+  ret <8 x i32> %s
+}
+
+; srem <2 x i64> by splat 7. SelectionDAG scalarizes to smulh by a mov/movk
+; constant plus shift/sign-bit adjustment, with no divide. The GlobalISel
+; output still contains two scalar sdiv instructions, followed by scalar mul
+; against a constant-pool vector (presumably the splat divisor) and a vector
+; sub.
+define <2 x i64> @sv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    asr x12, x11, #1
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI80_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI80_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+; srem <2 x i64> by splat 100. SelectionDAG scalarizes to smulh by a mov/movk
+; constant, adds the dividend back, shifts (asr #6 plus sign bit) and finishes
+; with msub by 100, with no divide. The GlobalISel output still contains two
+; scalar sdiv instructions, followed by scalar mul against a constant-pool
+; vector (presumably the splat divisor) and a vector sub.
+define <2 x i64> @sv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: sv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x10, x8
+; CHECK-SD-NEXT:    smulh x8, x9, x8
+; CHECK-SD-NEXT:    add x11, x11, x10
+; CHECK-SD-NEXT:    asr x12, x11, #6
+; CHECK-SD-NEXT:    add x8, x8, x9
+; CHECK-SD-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-SD-NEXT:    asr x13, x8, #6
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    add x8, x13, x8, lsr #63
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x8, x10, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI81_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI81_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+; srem <3 x i64> by splat 7; the three lanes arrive and leave in d0-d2.
+; SelectionDAG uses three scalar smulh sequences and no divide. The GlobalISel
+; output still contains three sdiv: lanes 0-1 are packed into one vector for
+; the multiply-back and subtract, lane 2 stays scalar (lsl #3/sub/sub).
+define <3 x i64> @sv3i64_7(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: sv3i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x10, x9, x8
+; CHECK-SD-NEXT:    smulh x12, x11, x8
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x14, x10, #1
+; CHECK-SD-NEXT:    asr x15, x12, #1
+; CHECK-SD-NEXT:    add x10, x14, x10, lsr #63
+; CHECK-SD-NEXT:    asr x16, x8, #1
+; CHECK-SD-NEXT:    add x12, x15, x12, lsr #63
+; CHECK-SD-NEXT:    sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT:    add x8, x16, x8, lsr #63
+; CHECK-SD-NEXT:    sub x12, x12, x12, lsl #3
+; CHECK-SD-NEXT:    add x9, x9, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x11, x12
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI82_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI82_0]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x13, d4
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sdiv x8, x9, x8
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x10, v3.d[1]
+; CHECK-GI-NEXT:    mul x12, x12, x13
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov d2, x12
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    lsl x10, x8, #3
+; CHECK-GI-NEXT:    sub x8, x10, x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i64> %d, <i64 7, i64 7, i64 7>
+  ret <3 x i64> %s
+}
+
+; srem <3 x i64> by splat 100; the three lanes arrive and leave in d0-d2.
+; SelectionDAG uses three scalar smulh sequences ending in msub by 100 and no
+; divide. The GlobalISel output still contains three sdiv: lanes 0-1 are
+; packed into one vector for the multiply-back and subtract, lane 2 is
+; finished with a scalar msub.
+define <3 x i64> @sv3i64_100(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: sv3i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x10, x9, x8
+; CHECK-SD-NEXT:    smulh x12, x11, x8
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    add x10, x10, x9
+; CHECK-SD-NEXT:    asr x14, x10, #6
+; CHECK-SD-NEXT:    add x12, x12, x11
+; CHECK-SD-NEXT:    add x10, x14, x10, lsr #63
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    asr x15, x12, #6
+; CHECK-SD-NEXT:    add x8, x8, x13
+; CHECK-SD-NEXT:    msub x9, x10, x14, x9
+; CHECK-SD-NEXT:    asr x10, x8, #6
+; CHECK-SD-NEXT:    add x12, x15, x12, lsr #63
+; CHECK-SD-NEXT:    add x8, x10, x8, lsr #63
+; CHECK-SD-NEXT:    msub x10, x12, x14, x11
+; CHECK-SD-NEXT:    msub x8, x8, x14, x13
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv3i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI83_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI83_0]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x14, d4
+; CHECK-GI-NEXT:    mov x12, v4.d[1]
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sdiv x10, x9, x8
+; CHECK-GI-NEXT:    fmov x13, d3
+; CHECK-GI-NEXT:    mov x11, v3.d[1]
+; CHECK-GI-NEXT:    mul x13, x13, x14
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    fmov d2, x13
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    msub x8, x10, x8, x9
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <3 x i64> %d, <i64 100, i64 100, i64 100>
+  ret <3 x i64> %s
+}
+
+; srem <4 x i64> by splat 7, i.e. two 128-bit registers. SelectionDAG
+; scalarizes to four smulh sequences with no divide. The GlobalISel output
+; still contains four scalar sdiv instructions, then multiplies the quotients
+; back with scalar mul against a constant-pool vector (presumably the splat
+; divisor) and subtracts with two vector sub.
+define <4 x i64> @sv4i64_7(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: sv4i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #32
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x9, x8
+; CHECK-SD-NEXT:    smulh x14, x12, x8
+; CHECK-SD-NEXT:    smulh x15, x10, x8
+; CHECK-SD-NEXT:    asr x16, x11, #1
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x17, x14, #1
+; CHECK-SD-NEXT:    add x11, x16, x11, lsr #63
+; CHECK-SD-NEXT:    add x14, x17, x14, lsr #63
+; CHECK-SD-NEXT:    asr x18, x15, #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    asr x0, x8, #1
+; CHECK-SD-NEXT:    sub x14, x14, x14, lsl #3
+; CHECK-SD-NEXT:    add x15, x18, x15, lsr #63
+; CHECK-SD-NEXT:    add x9, x9, x11
+; CHECK-SD-NEXT:    add x8, x0, x8, lsr #63
+; CHECK-SD-NEXT:    add x11, x12, x14
+; CHECK-SD-NEXT:    sub x15, x15, x15, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x10, x15
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x11, x11, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI84_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI84_0]
+; CHECK-GI-NEXT:    fmov x9, d4
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    sdiv x8, x12, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x12, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mul x11, x13, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i64> %d, <i64 7, i64 7, i64 7, i64 7>
+  ret <4 x i64> %s
+}
+
+; srem <4 x i64> by splat 100, i.e. two 128-bit registers. SelectionDAG
+; scalarizes to four smulh sequences ending in msub by 100, with no divide.
+; The GlobalISel output still contains four scalar sdiv instructions, then
+; multiplies the quotients back with scalar mul against a constant-pool vector
+; (presumably the splat divisor) and subtracts with two vector sub.
+define <4 x i64> @sv4i64_100(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: sv4i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #28835, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #2621, lsl #32
+; CHECK-SD-NEXT:    movk x8, #41943, lsl #48
+; CHECK-SD-NEXT:    smulh x11, x9, x8
+; CHECK-SD-NEXT:    smulh x14, x12, x8
+; CHECK-SD-NEXT:    smulh x15, x10, x8
+; CHECK-SD-NEXT:    add x11, x11, x9
+; CHECK-SD-NEXT:    smulh x8, x13, x8
+; CHECK-SD-NEXT:    asr x16, x11, #6
+; CHECK-SD-NEXT:    add x14, x14, x12
+; CHECK-SD-NEXT:    asr x17, x14, #6
+; CHECK-SD-NEXT:    add x11, x16, x11, lsr #63
+; CHECK-SD-NEXT:    mov w16, #100 // =0x64
+; CHECK-SD-NEXT:    add x15, x15, x10
+; CHECK-SD-NEXT:    add x14, x17, x14, lsr #63
+; CHECK-SD-NEXT:    msub x9, x11, x16, x9
+; CHECK-SD-NEXT:    asr x11, x15, #6
+; CHECK-SD-NEXT:    add x8, x8, x13
+; CHECK-SD-NEXT:    msub x12, x14, x16, x12
+; CHECK-SD-NEXT:    asr x14, x8, #6
+; CHECK-SD-NEXT:    add x11, x11, x15, lsr #63
+; CHECK-SD-NEXT:    add x8, x14, x8, lsr #63
+; CHECK-SD-NEXT:    msub x10, x11, x16, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    msub x8, x8, x16, x13
+; CHECK-SD-NEXT:    fmov d1, x12
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv4i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov w8, #100 // =0x64
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    sdiv x9, x9, x8
+; CHECK-GI-NEXT:    sdiv x11, x11, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI85_0
+; CHECK-GI-NEXT:    ldr q4, [x9, :lo12:.LCPI85_0]
+; CHECK-GI-NEXT:    fmov x9, d4
+; CHECK-GI-NEXT:    sdiv x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov x11, v4.d[1]
+; CHECK-GI-NEXT:    sdiv x8, x12, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x12, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mul x11, x13, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <4 x i64> %d, <i64 100, i64 100, i64 100, i64 100>
+  ret <4 x i64> %s
+}
+
+; urem <2 x i64> by splat 7. Both outputs are divide-free and use scalar
+; umulh by the same mov/movk constant. SelectionDAG keeps the rest scalar;
+; GlobalISel does the sub/usra #1/ushr #2 adjustment on vectors, then
+; multiplies back lane by lane with scalar mul against a constant-pool vector
+; (presumably the splat divisor) before the final vector sub.
+define <2 x i64> @uv2i64_7(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x11, x10, x8
+; CHECK-SD-NEXT:    umulh x8, x9, x8
+; CHECK-SD-NEXT:    sub x12, x10, x11
+; CHECK-SD-NEXT:    add x11, x11, x12, lsr #1
+; CHECK-SD-NEXT:    sub x12, x9, x8
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    add x8, x8, x12, lsr #1
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    add x10, x10, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    add x8, x9, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x9, v0.d[1]
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI86_0
+; CHECK-GI-NEXT:    sub v2.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    usra v1.2d, v2.2d, #1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI86_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 7, i64 7>
+  ret <2 x i64> %s
+}
+
+; urem <2 x i64> by splat 100. Both outputs are divide-free: shift right by
+; 2, scalar umulh by a mov/movk constant, shift right by 2 again. SelectionDAG
+; finishes with msub by 100; GlobalISel multiplies back with scalar mul
+; against a constant-pool vector (presumably the splat divisor) and a vector
+; sub.
+define <2 x i64> @uv2i64_100(<2 x i64> %d, <2 x i64> %e) {
+; CHECK-SD-LABEL: uv2i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    movk x8, #23592, lsl #16
+; CHECK-SD-NEXT:    movk x8, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x11, x10, #2
+; CHECK-SD-NEXT:    movk x8, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x12, x9, #2
+; CHECK-SD-NEXT:    umulh x11, x11, x8
+; CHECK-SD-NEXT:    umulh x8, x12, x8
+; CHECK-SD-NEXT:    mov w12, #100 // =0x64
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    msub x10, x11, x12, x10
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    msub x8, x8, x12, x9
+; CHECK-SD-NEXT:    fmov d0, x10
+; CHECK-SD-NEXT:    mov v0.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv2i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v1.2d, v0.2d, #2
+; CHECK-GI-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    umulh x8, x9, x8
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI87_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI87_0]
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mov x8, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov d1, x10
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <2 x i64> %d, <i64 100, i64 100>
+  ret <2 x i64> %s
+}
+
+; urem <3 x i64> by splat 7; the three lanes arrive and leave in d0-d2. Both
+; outputs are divide-free and use umulh by the same mov/movk constant.
+; SelectionDAG handles all three lanes as scalars. GlobalISel packs lanes 0-1
+; into vectors (building both the input pair and the umulh result pair twice:
+; v0/v4 and v3/v5) and handles lane 2 with a scalar sequence.
+define <3 x i64> @uv3i64_7(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: uv3i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x10, x9, x8
+; CHECK-SD-NEXT:    umulh x12, x11, x8
+; CHECK-SD-NEXT:    umulh x8, x13, x8
+; CHECK-SD-NEXT:    sub x14, x9, x10
+; CHECK-SD-NEXT:    add x10, x10, x14, lsr #1
+; CHECK-SD-NEXT:    sub x15, x11, x12
+; CHECK-SD-NEXT:    add x12, x12, x15, lsr #1
+; CHECK-SD-NEXT:    lsr x10, x10, #2
+; CHECK-SD-NEXT:    sub x16, x13, x8
+; CHECK-SD-NEXT:    add x8, x8, x16, lsr #1
+; CHECK-SD-NEXT:    lsr x12, x12, #2
+; CHECK-SD-NEXT:    sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    sub x12, x12, x12, lsl #3
+; CHECK-SD-NEXT:    add x9, x9, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x11, x12
+; CHECK-SD-NEXT:    fmov d1, x10
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    fmov d2, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    mov v4.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    mov v4.d[1], v1.d[0]
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d3, x9
+; CHECK-GI-NEXT:    fmov d5, x9
+; CHECK-GI-NEXT:    adrp x9, .LCPI88_0
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    mov v5.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    ldr q3, [x9, :lo12:.LCPI88_0]
+; CHECK-GI-NEXT:    mov x10, v3.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    usra v5.2d, v0.2d, #1
+; CHECK-GI-NEXT:    ushr v0.2d, v5.2d, #2
+; CHECK-GI-NEXT:    mov x9, v0.d[1]
+; CHECK-GI-NEXT:    fmov x11, d0
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    umulh x8, x10, x8
+; CHECK-GI-NEXT:    fmov d0, x11
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    sub x9, x10, x8
+; CHECK-GI-NEXT:    add x8, x8, x9, lsr #1
+; CHECK-GI-NEXT:    sub v0.2d, v4.2d, v0.2d
+; CHECK-GI-NEXT:    lsr x8, x8, #2
+; CHECK-GI-NEXT:    lsl x9, x8, #3
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    sub x8, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i64> %d, <i64 7, i64 7, i64 7>
+  ret <3 x i64> %s
+}
+
+; urem <3 x i64> by splat 100; the three lanes arrive and leave in d0-d2.
+; Both outputs are divide-free (lsr/ushr #2, umulh, lsr/ushr #2). SelectionDAG
+; finishes every lane with msub by 100. GlobalISel packs lanes 0-1 into
+; vectors and multiplies back via a constant-pool vector (presumably the
+; splat divisor); lane 2 uses a scalar msub by 100.
+define <3 x i64> @uv3i64_100(<3 x i64> %d, <3 x i64> %e) {
+; CHECK-SD-LABEL: uv3i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    mov x10, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    movk x10, #23592, lsl #16
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    movk x10, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x9, x8, #2
+; CHECK-SD-NEXT:    movk x10, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x12, x11, #2
+; CHECK-SD-NEXT:    lsr x14, x13, #2
+; CHECK-SD-NEXT:    umulh x9, x9, x10
+; CHECK-SD-NEXT:    umulh x12, x12, x10
+; CHECK-SD-NEXT:    umulh x10, x14, x10
+; CHECK-SD-NEXT:    mov w14, #100 // =0x64
+; CHECK-SD-NEXT:    lsr x9, x9, #2
+; CHECK-SD-NEXT:    msub x8, x9, x14, x8
+; CHECK-SD-NEXT:    lsr x9, x12, #2
+; CHECK-SD-NEXT:    lsr x10, x10, #2
+; CHECK-SD-NEXT:    msub x9, x9, x14, x11
+; CHECK-SD-NEXT:    msub x10, x10, x14, x13
+; CHECK-SD-NEXT:    fmov d0, x8
+; CHECK-SD-NEXT:    fmov d1, x9
+; CHECK-SD-NEXT:    fmov d2, x10
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv3i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v3.16b, v0.16b
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov x9, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x9, #23592, lsl #16
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x9, #49807, lsl #32
+; CHECK-GI-NEXT:    mov v3.d[1], v1.d[0]
+; CHECK-GI-NEXT:    movk x9, #10485, lsl #48
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    mov x8, v3.d[1]
+; CHECK-GI-NEXT:    umulh x10, x10, x9
+; CHECK-GI-NEXT:    umulh x8, x8, x9
+; CHECK-GI-NEXT:    fmov d3, x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI89_0
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI89_0]
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    fmov x12, d4
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    mov x8, v3.d[1]
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mul x8, x8, x10
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    lsr x12, x10, #2
+; CHECK-GI-NEXT:    fmov d2, x11
+; CHECK-GI-NEXT:    umulh x9, x12, x9
+; CHECK-GI-NEXT:    mov v2.d[1], x8
+; CHECK-GI-NEXT:    lsr x8, x9, #2
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    msub x8, x8, x9, x10
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <3 x i64> %d, <i64 100, i64 100, i64 100>
+  ret <3 x i64> %s
+}
+
+; Test: urem of <4 x i64> by a splat of 7 (the %e argument is unused).
+; The -SD prefix holds the -global-isel=0 output and the -GI prefix holds the
+; -global-isel=1 output. Both expected sequences multiply-high (umulh) by a
+; materialized 64-bit constant and contain no udiv instruction; the GI
+; sequence additionally loads a vector constant from .LCPI90_0 and multiplies
+; each lane by it with scalar mul before the final vector sub.
+; The assertions below are autogenerated by utils/update_llc_test_checks.py;
+; regenerate them instead of editing them by hand.
+define <4 x i64> @uv4i64_7(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: uv4i64_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #37449, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x11, x9, x8
+; CHECK-SD-NEXT:    umulh x14, x12, x8
+; CHECK-SD-NEXT:    umulh x15, x10, x8
+; CHECK-SD-NEXT:    sub x16, x9, x11
+; CHECK-SD-NEXT:    umulh x8, x13, x8
+; CHECK-SD-NEXT:    add x11, x11, x16, lsr #1
+; CHECK-SD-NEXT:    sub x17, x12, x14
+; CHECK-SD-NEXT:    add x14, x14, x17, lsr #1
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    sub x16, x10, x15
+; CHECK-SD-NEXT:    add x15, x15, x16, lsr #1
+; CHECK-SD-NEXT:    lsr x14, x14, #2
+; CHECK-SD-NEXT:    sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT:    sub x16, x13, x8
+; CHECK-SD-NEXT:    add x8, x8, x16, lsr #1
+; CHECK-SD-NEXT:    sub x14, x14, x14, lsl #3
+; CHECK-SD-NEXT:    lsr x15, x15, #2
+; CHECK-SD-NEXT:    add x9, x9, x11
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    add x11, x12, x14
+; CHECK-SD-NEXT:    sub x15, x15, x15, lsl #3
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT:    add x10, x10, x15
+; CHECK-SD-NEXT:    add x8, x13, x8
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i64_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov x8, #9363 // =0x2493
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #16
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x12, v1.d[1]
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #32
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #48
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    umulh x11, x11, x8
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    umulh x8, x12, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI90_0
+; CHECK-GI-NEXT:    sub v4.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v5.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    usra v2.2d, v4.2d, #1
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI90_0]
+; CHECK-GI-NEXT:    usra v3.2d, v5.2d, #1
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    ushr v2.2d, v2.2d, #2
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    mul x8, x12, x8
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    mul x10, x13, x10
+; CHECK-GI-NEXT:    fmov d3, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i64> %d, <i64 7, i64 7, i64 7, i64 7>
+  ret <4 x i64> %s
+}
+
+; Test: urem of <4 x i64> by a splat of 100 (the %e argument is unused).
+; The -SD prefix holds the -global-isel=0 output and the -GI prefix holds the
+; -global-isel=1 output. Both expected sequences shift each lane right by 2,
+; multiply-high (umulh) by a materialized 64-bit constant, and shift right by
+; 2 again; neither contains a udiv instruction. The SD sequence forms the
+; remainder with msub against #100, while the GI sequence loads a vector
+; constant from .LCPI91_0, multiplies each lane with scalar mul, and finishes
+; with a vector sub.
+; The assertions below are autogenerated by utils/update_llc_test_checks.py;
+; regenerate them instead of editing them by hand.
+define <4 x i64> @uv4i64_100(<4 x i64> %d, <4 x i64> %e) {
+; CHECK-SD-LABEL: uv4i64_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-SD-NEXT:    fmov x12, d1
+; CHECK-SD-NEXT:    movk x8, #23592, lsl #16
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    mov x13, v1.d[1]
+; CHECK-SD-NEXT:    movk x8, #49807, lsl #32
+; CHECK-SD-NEXT:    lsr x11, x9, #2
+; CHECK-SD-NEXT:    movk x8, #10485, lsl #48
+; CHECK-SD-NEXT:    lsr x14, x12, #2
+; CHECK-SD-NEXT:    umulh x11, x11, x8
+; CHECK-SD-NEXT:    lsr x15, x10, #2
+; CHECK-SD-NEXT:    lsr x16, x13, #2
+; CHECK-SD-NEXT:    umulh x14, x14, x8
+; CHECK-SD-NEXT:    umulh x15, x15, x8
+; CHECK-SD-NEXT:    lsr x11, x11, #2
+; CHECK-SD-NEXT:    umulh x8, x16, x8
+; CHECK-SD-NEXT:    mov w16, #100 // =0x64
+; CHECK-SD-NEXT:    msub x9, x11, x16, x9
+; CHECK-SD-NEXT:    lsr x11, x14, #2
+; CHECK-SD-NEXT:    msub x11, x11, x16, x12
+; CHECK-SD-NEXT:    lsr x12, x15, #2
+; CHECK-SD-NEXT:    lsr x8, x8, #2
+; CHECK-SD-NEXT:    msub x10, x12, x16, x10
+; CHECK-SD-NEXT:    fmov d0, x9
+; CHECK-SD-NEXT:    msub x8, x8, x16, x13
+; CHECK-SD-NEXT:    fmov d1, x11
+; CHECK-SD-NEXT:    mov v0.d[1], x10
+; CHECK-SD-NEXT:    mov v1.d[1], x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uv4i64_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushr v2.2d, v0.2d, #2
+; CHECK-GI-NEXT:    ushr v3.2d, v1.2d, #2
+; CHECK-GI-NEXT:    mov x8, #62915 // =0xf5c3
+; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    movk x8, #10485, lsl #48
+; CHECK-GI-NEXT:    mov x10, v2.d[1]
+; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    umulh x9, x9, x8
+; CHECK-GI-NEXT:    umulh x11, x11, x8
+; CHECK-GI-NEXT:    umulh x10, x10, x8
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    umulh x8, x12, x8
+; CHECK-GI-NEXT:    fmov d3, x11
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    mov v3.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI91_0
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI91_0]
+; CHECK-GI-NEXT:    ushr v2.2d, v2.2d, #2
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:    mov x10, v4.d[1]
+; CHECK-GI-NEXT:    ushr v3.2d, v3.2d, #2
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x12, d3
+; CHECK-GI-NEXT:    mov x13, v3.d[1]
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    mul x8, x12, x8
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    mul x10, x13, x10
+; CHECK-GI-NEXT:    fmov d3, x8
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = urem <4 x i64> %d, <i64 100, i64 100, i64 100, i64 100>
+  ret <4 x i64> %s
+}
+
+; Test: srem of <2 x i128> by a splat of 7 (the %e argument is unused).
+; The -SD prefix holds the -global-isel=0 output and the -GI prefix holds the
+; -global-isel=1 output. Neither expected sequence expands the remainder
+; inline: each contains two calls to __modti3, passing the divisor as
+; x2 = 7, x3 = xzr. The two sequences differ only in which of x19/x20 is used
+; to preserve the incoming x2 and x3 across the first call.
+; The assertions below are autogenerated by utils/update_llc_test_checks.py;
+; regenerate them instead of editing them by hand.
+define <2 x i128> @sv2i128_7(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_7:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #7 // =0x7
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_7:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #7 // =0x7
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 7, i128 7>
+  ret <2 x i128> %s
+}
+
+; Test: srem of <2 x i128> by a splat of 100 (the %e argument is unused).
+; The -SD prefix holds the -global-isel=0 output and the -GI prefix holds the
+; -global-isel=1 output. Neither expected sequence expands the remainder
+; inline: each contains two calls to __modti3, passing the divisor as
+; x2 = 100, x3 = xzr. The two sequences differ only in which of x19/x20 is
+; used to preserve the incoming x2 and x3 across the first call.
+; The assertions below are autogenerated by utils/update_llc_test_checks.py;
+; regenerate them instead of editing them by hand.
+define <2 x i128> @sv2i128_100(<2 x i128> %d, <2 x i128> %e) {
+; CHECK-SD-LABEL: sv2i128_100:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w30, -48
+; CHECK-SD-NEXT:    mov x19, x3
+; CHECK-SD-NEXT:    mov x20, x2
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x21, x0
+; CHECK-SD-NEXT:    mov x22, x1
+; CHECK-SD-NEXT:    mov x0, x20
+; CHECK-SD-NEXT:    mov x1, x19
+; CHECK-SD-NEXT:    mov w2, #100 // =0x64
+; CHECK-SD-NEXT:    mov x3, xzr
+; CHECK-SD-NEXT:    bl __modti3
+; CHECK-SD-NEXT:    mov x2, x0
+; CHECK-SD-NEXT:    mov x3, x1
+; CHECK-SD-NEXT:    mov x0, x21
+; CHECK-SD-NEXT:    mov x1, x22
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sv2i128_100:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w30, -48
+; CHECK-GI-NEXT:    mov x19, x2
+; CHECK-GI-NEXT:    mov x20, x3
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x21, x0
+; CHECK-GI-NEXT:    mov x22, x1
+; CHECK-GI-NEXT:    mov x0, x19
+; CHECK-GI-NEXT:    mov x1, x20
+; CHECK-GI-NEXT:    mov w2, #100 // =0x64
+; CHECK-GI-NEXT:    mov x3, xzr
+; CHECK-GI-NEXT:    bl __modti3
+; CHECK-GI-NEXT:    mov x2, x0
+; CHECK-GI-NEXT:    mov x3, x1
+; CHECK-GI-NEXT:    mov x0, x21
+; CHECK-GI-NEXT:    mov x1, x22
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %s = srem <2 x i128> %d, <i128 100, i128 100>
+  ret <2 x i128> %s
+}
+
+define <3 x i128> @sv3i128_7(<3 x i128> %d, <3 x i128> %e) {
----------------
davemgreen wrote:

This file is fairly long. Because of the way we legalize nodes, I think we can avoid testing some of the large special cases and remove the cases larger than 2 x i128. They should just be the same as the 2 x i128 cases but larger, and the actual legalization is tested elsewhere.

https://github.com/llvm/llvm-project/pull/145914


More information about the llvm-commits mailing list