[llvm] r364631 - [ARM] Mark div and rem as expand for MVE

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 28 01:18:56 PDT 2019


Author: dmgreen
Date: Fri Jun 28 01:18:55 2019
New Revision: 364631

URL: http://llvm.org/viewvc/llvm-project?rev=364631&view=rev
Log:
[ARM] Mark div and rem as expand for MVE

MVE has no vector division or remainder instructions, so these operations need
to be expanded for both integer and floating-point vector types.

Differential Revision: https://reviews.llvm.org/D63595

Added:
    llvm/trunk/test/CodeGen/Thumb2/mve-div-expand.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=364631&r1=364630&r2=364631&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Fri Jun 28 01:18:55 2019
@@ -235,6 +235,12 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+
+    // MVE has no vector integer division or remainder instructions.
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
   }
 
   const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
@@ -252,6 +258,12 @@ void ARMTargetLowering::addMVEVectorType
     setOperationAction(ISD::BITCAST, VT, Legal);
     setOperationAction(ISD::LOAD, VT, Legal);
     setOperationAction(ISD::STORE, VT, Legal);
+
+    if (HasMVEFP) {
+      // MVE has no vector floating-point division or remainder instructions.
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+    }
   }
 
   // We 'support' these types up to bitcast/load/store level, regardless of

Added: llvm/trunk/test/CodeGen/Thumb2/mve-div-expand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-div-expand.ll?rev=364631&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-div-expand.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-div-expand.ll Fri Jun 28 01:18:55 2019
@@ -0,0 +1,1243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
+; CHECK-LABEL: udiv_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    udiv r1, r2, r1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %out = udiv <4 x i32> %in1, %in2
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) {
+; CHECK-LABEL: sdiv_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    sdiv r0, r1, r0
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    sdiv r0, r1, r0
+; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    sdiv r0, r1, r0
+; CHECK-NEXT:    vmov.32 q2[3], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %out = sdiv <4 x i32> %in1, %in2
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) {
+; CHECK-LABEL: urem_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    udiv r2, r1, r0
+; CHECK-NEXT:    mls r12, r2, r0, r1
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    udiv r3, r2, r1
+; CHECK-NEXT:    mls lr, r3, r1, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    udiv r0, r3, r2
+; CHECK-NEXT:    mls r0, r0, r2, r3
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    udiv r1, r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    mls r1, r1, r2, r3
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %out = urem <4 x i32> %in1, %in2
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) {
+; CHECK-LABEL: srem_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    sdiv r2, r1, r0
+; CHECK-NEXT:    mls r12, r2, r0, r1
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    sdiv r3, r2, r1
+; CHECK-NEXT:    mls lr, r3, r1, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    sdiv r0, r3, r2
+; CHECK-NEXT:    mls r0, r0, r2, r3
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov.32 q0[0], r12
+; CHECK-NEXT:    sdiv r1, r3, r2
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    mls r1, r1, r2, r3
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %out = srem <4 x i32> %in1, %in2
+  ret <4 x i32> %out
+}
+
+
+define arm_aapcs_vfpcc <8 x i16> @udiv_i16(<8 x i16> %in1, <8 x i16> %in2) {
+; CHECK-LABEL: udiv_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    udiv r1, r2, r1
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q2[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %out = udiv <8 x i16> %in1, %in2
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sdiv_i16(<8 x i16> %in1, <8 x i16> %in2) {
+; CHECK-LABEL: sdiv_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    sdiv r12, r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    vmov.u16 r4, q1[6]
+; CHECK-NEXT:    sdiv r3, r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    vmov.u16 r5, q0[6]
+; CHECK-NEXT:    sdiv r0, r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.16 q2[0], r1
+; CHECK-NEXT:    sxth.w lr, r2
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q2[2], r3
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
+; CHECK-NEXT:    sxth r6, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    sxth r3, r3
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxth r1, r1
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmov.16 q2[3], r12
+; CHECK-NEXT:    sdiv r2, r2, r3
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    sdiv r0, r0, r1
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    sdiv r0, r5, r4
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    sdiv r0, r6, lr
+; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %out = sdiv <8 x i16> %in1, %in2
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @urem_i16(<8 x i16> %in1, <8 x i16> %in2) {
+; CHECK-LABEL: urem_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    udiv r2, r1, r0
+; CHECK-NEXT:    mls r12, r2, r0, r1
+; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    udiv r3, r2, r1
+; CHECK-NEXT:    mls lr, r3, r1, r2
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    udiv r0, r3, r2
+; CHECK-NEXT:    mls r2, r0, r2, r3
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r3, q0[5]
+; CHECK-NEXT:    udiv r1, r3, r0
+; CHECK-NEXT:    mls r0, r1, r0, r3
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    udiv r4, r3, r1
+; CHECK-NEXT:    mls r1, r4, r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[3]
+; CHECK-NEXT:    vmov.u16 r4, q0[3]
+; CHECK-NEXT:    udiv r5, r4, r3
+; CHECK-NEXT:    mls r3, r5, r3, r4
+; CHECK-NEXT:    vmov.u16 r4, q1[0]
+; CHECK-NEXT:    vmov.u16 r5, q0[0]
+; CHECK-NEXT:    udiv r6, r5, r4
+; CHECK-NEXT:    mls r4, r6, r4, r5
+; CHECK-NEXT:    vmov.u16 r6, q0[1]
+; CHECK-NEXT:    vmov.u16 r5, q1[1]
+; CHECK-NEXT:    udiv r7, r6, r5
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    mls r5, r7, r5, r6
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r1
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.16 q0[6], r12
+; CHECK-NEXT:    vmov.16 q0[7], lr
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %out = urem <8 x i16> %in1, %in2
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @srem_i16(<8 x i16> %in1, <8 x i16> %in2) {
+; CHECK-LABEL: srem_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    vmov.u16 r5, q1[6]
+; CHECK-NEXT:    vmov.u16 r6, q0[6]
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    sxth r6, r6
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    sdiv r7, r6, r5
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    sxth.w r8, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    mls r12, r7, r5, r6
+; CHECK-NEXT:    vmov.u16 r7, q0[7]
+; CHECK-NEXT:    sxth r3, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxth r7, r7
+; CHECK-NEXT:    sxth r4, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    sdiv r6, r7, r2
+; CHECK-NEXT:    mls lr, r6, r2, r7
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    sxth r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sdiv r5, r2, r0
+; CHECK-NEXT:    vmov.u16 r6, q0[1]
+; CHECK-NEXT:    mls r0, r5, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sdiv r5, r2, r1
+; CHECK-NEXT:    sxth r6, r6
+; CHECK-NEXT:    mls r1, r5, r1, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sdiv r5, r2, r4
+; CHECK-NEXT:    mls r2, r5, r4, r2
+; CHECK-NEXT:    vmov.u16 r4, q0[3]
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    sdiv r5, r4, r3
+; CHECK-NEXT:    mls r3, r5, r3, r4
+; CHECK-NEXT:    vmov.u16 r4, q0[0]
+; CHECK-NEXT:    sxth r4, r4
+; CHECK-NEXT:    sdiv r5, r4, r8
+; CHECK-NEXT:    mls r4, r5, r8, r4
+; CHECK-NEXT:    vmov.u16 r5, q1[1]
+; CHECK-NEXT:    sxth r5, r5
+; CHECK-NEXT:    sdiv r7, r6, r5
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    mls r5, r7, r5, r6
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r12
+; CHECK-NEXT:    vmov.16 q0[7], lr
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+entry:
+  %out = srem <8 x i16> %in1, %in2
+  ret <8 x i16> %out
+}
+
+
+define arm_aapcs_vfpcc <16 x i8> @udiv_i8(<16 x i8> %in1, <16 x i8> %in2) {
+; CHECK-LABEL: udiv_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.8 q2[0], r0
+; CHECK-NEXT:    udiv r1, r2, r1
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.8 q2[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.8 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    vmov.8 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    vmov.8 q2[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.8 q2[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.8 q2[8], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.8 q2[9], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.8 q2[11], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.8 q2[12], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.8 q2[13], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.8 q2[14], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    udiv r0, r1, r0
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %out = udiv <16 x i8> %in1, %in2
+  ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @sdiv_i8(<16 x i8> %in1, <16 x i8> %in2) {
+; CHECK-LABEL: sdiv_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    sdiv r0, r1, r0
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.u8 r4, q1[3]
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    vmov.u8 r5, q0[3]
+; CHECK-NEXT:    vmov.8 q2[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
+; CHECK-NEXT:    vmov.8 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r2, q1[11]
+; CHECK-NEXT:    vmov.u8 r3, q0[11]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sdiv r0, r0, r1
+; CHECK-NEXT:    sxtb.w r12, r2
+; CHECK-NEXT:    sxtb.w lr, r3
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[2], r0
+; CHECK-NEXT:    sdiv r0, r5, r4
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.8 q2[3], r0
+; CHECK-NEXT:    sdiv r0, r3, r2
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.8 q2[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sdiv r12, lr, r12
+; CHECK-NEXT:    sdiv lr, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sdiv r2, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    sdiv r1, r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sdiv r4, r3, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.u8 r6, q0[12]
+; CHECK-NEXT:    sdiv r5, r3, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    sdiv r0, r3, r0
+; CHECK-NEXT:    vmov.u8 r3, q1[15]
+; CHECK-NEXT:    vmov.8 q2[5], r0
+; CHECK-NEXT:    sxtb r7, r3
+; CHECK-NEXT:    vmov.8 q2[6], r5
+; CHECK-NEXT:    vmov.u8 r3, q1[12]
+; CHECK-NEXT:    vmov.8 q2[7], r4
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    vmov.8 q2[8], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.8 q2[9], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.8 q2[10], lr
+; CHECK-NEXT:    vmov.u8 r5, q1[14]
+; CHECK-NEXT:    vmov.u8 r4, q0[14]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.8 q2[11], r12
+; CHECK-NEXT:    sdiv r3, r6, r3
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.8 q2[12], r3
+; CHECK-NEXT:    sdiv r1, r2, r1
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmov.8 q2[13], r1
+; CHECK-NEXT:    sdiv r1, r4, r5
+; CHECK-NEXT:    sdiv r0, r0, r7
+; CHECK-NEXT:    vmov.8 q2[14], r1
+; CHECK-NEXT:    vmov.8 q2[15], r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %out = sdiv <16 x i8> %in1, %in2
+  ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @urem_i8(<16 x i8> %in1, <16 x i8> %in2) {
+; CHECK-LABEL: urem_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    udiv r2, r1, r0
+; CHECK-NEXT:    mls r12, r2, r0, r1
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    udiv r2, r1, r0
+; CHECK-NEXT:    mls lr, r2, r0, r1
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    udiv r2, r1, r0
+; CHECK-NEXT:    mls r8, r2, r0, r1
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    udiv r3, r1, r0
+; CHECK-NEXT:    mls r3, r3, r0, r1
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    udiv r4, r1, r0
+; CHECK-NEXT:    mls r0, r4, r0, r1
+; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    udiv r5, r4, r1
+; CHECK-NEXT:    mls r1, r5, r1, r4
+; CHECK-NEXT:    vmov.u8 r4, q1[8]
+; CHECK-NEXT:    vmov.u8 r5, q0[8]
+; CHECK-NEXT:    udiv r6, r5, r4
+; CHECK-NEXT:    mls r4, r6, r4, r5
+; CHECK-NEXT:    vmov.u8 r5, q1[0]
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    udiv r7, r6, r5
+; CHECK-NEXT:    mls r5, r7, r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[1]
+; CHECK-NEXT:    vmov.u8 r7, q0[1]
+; CHECK-NEXT:    udiv r2, r7, r6
+; CHECK-NEXT:    vmov.8 q2[0], r5
+; CHECK-NEXT:    mls r2, r2, r6, r7
+; CHECK-NEXT:    vmov.u8 r5, q0[2]
+; CHECK-NEXT:    vmov.8 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[2]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[3]
+; CHECK-NEXT:    vmov.8 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    vmov.8 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    vmov.8 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[6]
+; CHECK-NEXT:    vmov.8 q2[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[7]
+; CHECK-NEXT:    vmov.8 q2[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    vmov.8 q2[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    udiv r6, r5, r2
+; CHECK-NEXT:    vmov.8 q2[8], r4
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.8 q2[9], r2
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.8 q2[11], r1
+; CHECK-NEXT:    vmov.8 q2[12], r8
+; CHECK-NEXT:    vmov.8 q2[13], r3
+; CHECK-NEXT:    vmov.8 q2[14], r12
+; CHECK-NEXT:    vmov.8 q2[15], lr
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+entry:
+  %out = urem <16 x i8> %in1, %in2
+  ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @srem_i8(<16 x i8> %in1, <16 x i8> %in2) {
+; CHECK-LABEL: srem_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    vmov.u8 r5, q1[14]
+; CHECK-NEXT:    vmov.u8 r6, q0[14]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    sdiv r7, r6, r5
+; CHECK-NEXT:    vmov.u8 r4, q1[15]
+; CHECK-NEXT:    mls r12, r7, r5, r6
+; CHECK-NEXT:    vmov.u8 r7, q0[15]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.u8 r2, q1[13]
+; CHECK-NEXT:    sxtb r7, r7
+; CHECK-NEXT:    sxtb r3, r2
+; CHECK-NEXT:    sdiv r6, r7, r4
+; CHECK-NEXT:    vmov.u8 r2, q1[12]
+; CHECK-NEXT:    mls lr, r6, r4, r7
+; CHECK-NEXT:    vmov.u8 r4, q0[12]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sxtb.w r8, r0
+; CHECK-NEXT:    sdiv r5, r4, r2
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    mls r9, r5, r2, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[13]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    vmov.u8 r6, q0[0]
+; CHECK-NEXT:    sdiv r5, r4, r3
+; CHECK-NEXT:    sxtb r1, r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    mls r3, r5, r3, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[10]
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    sdiv r5, r4, r0
+; CHECK-NEXT:    mls r0, r5, r0, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[11]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sdiv r5, r4, r1
+; CHECK-NEXT:    mls r1, r5, r1, r4
+; CHECK-NEXT:    vmov.u8 r4, q0[8]
+; CHECK-NEXT:    sxtb r4, r4
+; CHECK-NEXT:    sdiv r5, r4, r8
+; CHECK-NEXT:    mls r4, r5, r8, r4
+; CHECK-NEXT:    vmov.u8 r5, q1[0]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    sdiv r7, r6, r5
+; CHECK-NEXT:    mls r5, r7, r5, r6
+; CHECK-NEXT:    vmov.u8 r6, q1[1]
+; CHECK-NEXT:    vmov.u8 r7, q0[1]
+; CHECK-NEXT:    sxtb r6, r6
+; CHECK-NEXT:    sxtb r7, r7
+; CHECK-NEXT:    vmov.8 q2[0], r5
+; CHECK-NEXT:    sdiv r2, r7, r6
+; CHECK-NEXT:    vmov.u8 r5, q0[2]
+; CHECK-NEXT:    mls r2, r2, r6, r7
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[2]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[3]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[3]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[4]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[4]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[5]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[5]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[6]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[6]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[7]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[7]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.u8 r5, q0[9]
+; CHECK-NEXT:    sxtb r5, r5
+; CHECK-NEXT:    vmov.8 q2[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q1[9]
+; CHECK-NEXT:    sxtb r2, r2
+; CHECK-NEXT:    vmov.8 q2[8], r4
+; CHECK-NEXT:    sdiv r6, r5, r2
+; CHECK-NEXT:    mls r2, r6, r2, r5
+; CHECK-NEXT:    vmov.8 q2[9], r2
+; CHECK-NEXT:    vmov.8 q2[10], r0
+; CHECK-NEXT:    vmov.8 q2[11], r1
+; CHECK-NEXT:    vmov.8 q2[12], r9
+; CHECK-NEXT:    vmov.8 q2[13], r3
+; CHECK-NEXT:    vmov.8 q2[14], r12
+; CHECK-NEXT:    vmov.8 q2[15], lr
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+  %out = srem <16 x i8> %in1, %in2
+  ret <16 x i8> %out
+}
+
+
+define arm_aapcs_vfpcc <4 x float> @fdiv_f32(<4 x float> %in1, <4 x float> %in2) {
+; CHECK-MVE-LABEL: fdiv_f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vdiv.f32 s8, s0, s4
+; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vdiv.f32 s10, s1, s5
+; CHECK-MVE-NEXT:    vdiv.f32 s12, s2, s6
+; CHECK-MVE-NEXT:    vdiv.f32 s4, s3, s7
+; CHECK-MVE-NEXT:    vdup.32 q0, r0
+; CHECK-MVE-NEXT:    vmov.f32 s0, s8
+; CHECK-MVE-NEXT:    vmov.f32 s1, s10
+; CHECK-MVE-NEXT:    vmov.f32 s2, s12
+; CHECK-MVE-NEXT:    vmov.f32 s3, s4
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: fdiv_f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov q2, q0
+; CHECK-MVEFP-NEXT:    vdiv.f32 s0, s8, s4
+; CHECK-MVEFP-NEXT:    vdiv.f32 s1, s9, s5
+; CHECK-MVEFP-NEXT:    vdiv.f32 s2, s10, s6
+; CHECK-MVEFP-NEXT:    vdiv.f32 s3, s11, s7
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %out = fdiv <4 x float> %in1, %in2
+  ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @frem_f32(<4 x float> %in1, <4 x float> %in2) {
+; CHECK-MVE-LABEL: frem_f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-MVE-NEXT:    push {r4, r5, r6, lr}
+; CHECK-MVE-NEXT:    .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT:    vpush {d8, d9, d10}
+; CHECK-MVE-NEXT:    .pad #32
+; CHECK-MVE-NEXT:    sub sp, #32
+; CHECK-MVE-NEXT:    vstr s3, [sp, #24]
+; CHECK-MVE-NEXT:    ldr r4, [sp, #24]
+; CHECK-MVE-NEXT:    vstr s1, [sp, #8]
+; CHECK-MVE-NEXT:    vstr s5, [sp, #12]
+; CHECK-MVE-NEXT:    vstr s0, [sp]
+; CHECK-MVE-NEXT:    vstr s4, [sp, #4]
+; CHECK-MVE-NEXT:    vstr s7, [sp, #28]
+; CHECK-MVE-NEXT:    vstr s2, [sp, #16]
+; CHECK-MVE-NEXT:    vstr s6, [sp, #20]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #8]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    mov r5, r0
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    ldrd r2, r1, [sp, #16]
+; CHECK-MVE-NEXT:    vmov s16, r0
+; CHECK-MVE-NEXT:    ldr r6, [sp, #28]
+; CHECK-MVE-NEXT:    vmov s18, r5
+; CHECK-MVE-NEXT:    mov r0, r2
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s20, r0
+; CHECK-MVE-NEXT:    mov r0, r4
+; CHECK-MVE-NEXT:    mov r1, r6
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vdup.32 q0, r1
+; CHECK-MVE-NEXT:    vmov.f32 s0, s16
+; CHECK-MVE-NEXT:    vmov.f32 s1, s18
+; CHECK-MVE-NEXT:    vmov.f32 s2, s20
+; CHECK-MVE-NEXT:    vmov.f32 s3, s4
+; CHECK-MVE-NEXT:    add sp, #32
+; CHECK-MVE-NEXT:    vpop {d8, d9, d10}
+; CHECK-MVE-NEXT:    pop {r4, r5, r6, pc}
+;
+; CHECK-MVEFP-LABEL: frem_f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-MVEFP-NEXT:    push {r4, r5, r6, lr}
+; CHECK-MVEFP-NEXT:    .vsave {d8, d9}
+; CHECK-MVEFP-NEXT:    vpush {d8, d9}
+; CHECK-MVEFP-NEXT:    .pad #32
+; CHECK-MVEFP-NEXT:    sub sp, #32
+; CHECK-MVEFP-NEXT:    vstr s3, [sp, #24]
+; CHECK-MVEFP-NEXT:    ldr r4, [sp, #24]
+; CHECK-MVEFP-NEXT:    vstr s1, [sp, #8]
+; CHECK-MVEFP-NEXT:    vstr s5, [sp, #12]
+; CHECK-MVEFP-NEXT:    vstr s0, [sp]
+; CHECK-MVEFP-NEXT:    vstr s4, [sp, #4]
+; CHECK-MVEFP-NEXT:    vstr s7, [sp, #28]
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #16]
+; CHECK-MVEFP-NEXT:    vstr s6, [sp, #20]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #8]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    mov r5, r0
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    ldrd r2, r1, [sp, #16]
+; CHECK-MVEFP-NEXT:    vmov s16, r0
+; CHECK-MVEFP-NEXT:    ldr r6, [sp, #28]
+; CHECK-MVEFP-NEXT:    vmov s17, r5
+; CHECK-MVEFP-NEXT:    mov r0, r2
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s18, r0
+; CHECK-MVEFP-NEXT:    mov r0, r4
+; CHECK-MVEFP-NEXT:    mov r1, r6
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s19, r0
+; CHECK-MVEFP-NEXT:    vmov q0, q4
+; CHECK-MVEFP-NEXT:    add sp, #32
+; CHECK-MVEFP-NEXT:    vpop {d8, d9}
+; CHECK-MVEFP-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %out = frem <4 x float> %in1, %in2
+  ret <4 x float> %out
+}
+
+
+define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
+; CHECK-MVE-LABEL: fdiv_f16:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-MVE-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-MVE-NEXT:    vmov s10, r1
+; CHECK-MVE-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    movs r2, #0
+; CHECK-MVE-NEXT:    vdiv.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vmov r0, s8
+; CHECK-MVE-NEXT:    vmov s8, r1
+; CHECK-MVE-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-MVE-NEXT:    vmov s10, r1
+; CHECK-MVE-NEXT:    vdiv.f16 s8, s10, s8
+; CHECK-MVE-NEXT:    vmov r1, s8
+; CHECK-MVE-NEXT:    vdup.16 q2, r2
+; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-MVE-NEXT:    vmov s12, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-MVE-NEXT:    vmov s14, r0
+; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVE-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-MVE-NEXT:    vmov s12, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-MVE-NEXT:    vmov s14, r0
+; CHECK-MVE-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-MVE-NEXT:    vmov s12, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-MVE-NEXT:    vmov s14, r0
+; CHECK-MVE-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT:    vmov s12, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-MVE-NEXT:    vmov s14, r0
+; CHECK-MVE-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-MVE-NEXT:    vmov s12, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-MVE-NEXT:    vmov s14, r0
+; CHECK-MVE-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vdiv.f16 s0, s0, s4
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    bx lr
+; Both check blocks expect the <8 x half> fdiv to be scalarised: each of the eight lanes is extracted with vmov.u16, divided with a scalar vdiv.f16 and reinserted with vmov.16. The MVEFP-prefixed block below expects the same per-lane pattern, without the movs/vdup.16 zeroing of q2 seen above.
+; CHECK-MVEFP-LABEL: fdiv_f16:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-MVEFP-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-MVEFP-NEXT:    vmov s8, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-MVEFP-NEXT:    vmov s10, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-MVEFP-NEXT:    vdiv.f16 s8, s10, s8
+; CHECK-MVEFP-NEXT:    vmov s10, r2
+; CHECK-MVEFP-NEXT:    vmov r0, s8
+; CHECK-MVEFP-NEXT:    vmov s8, r1
+; CHECK-MVEFP-NEXT:    vdiv.f16 s8, s10, s8
+; CHECK-MVEFP-NEXT:    vmov r1, s8
+; CHECK-MVEFP-NEXT:    vmov.16 q2[0], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-MVEFP-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVEFP-NEXT:    vmov s12, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-MVEFP-NEXT:    vmov s14, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.16 q2[2], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-MVEFP-NEXT:    vmov s12, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-MVEFP-NEXT:    vmov s14, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.16 q2[3], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-MVEFP-NEXT:    vmov s12, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-MVEFP-NEXT:    vmov s14, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.16 q2[4], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-MVEFP-NEXT:    vmov s12, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-MVEFP-NEXT:    vmov s14, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.16 q2[5], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-MVEFP-NEXT:    vmov s12, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-MVEFP-NEXT:    vmov s14, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-MVEFP-NEXT:    vmov r0, s12
+; CHECK-MVEFP-NEXT:    vmov.16 q2[6], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-MVEFP-NEXT:    vmov s4, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vdiv.f16 s0, s0, s4
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVEFP-NEXT:    vmov q0, q2
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %out = fdiv <8 x half> %in1, %in2
+  ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
+; CHECK-MVE-LABEL: frem_f16:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    .save {r4, lr}
+; CHECK-MVE-NEXT:    push {r4, lr}
+; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-MVE-NEXT:    .pad #64
+; CHECK-MVE-NEXT:    sub sp, #64
+; CHECK-MVE-NEXT:    vmov q5, q1
+; CHECK-MVE-NEXT:    vmov q4, q0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #56]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #60]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #56]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r4, s0
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #48]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #52]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #48]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vdup.16 q6, r1
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[0], r4
+; CHECK-MVE-NEXT:    vmov.16 q6[1], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #40]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #44]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #40]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[2], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #32]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #36]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #32]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[3], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #24]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #28]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #24]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[4], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #16]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #20]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #16]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[5], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp, #8]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #12]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp, #8]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[6], r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-MVE-NEXT:    vmov s2, r0
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT:    vstr s2, [sp]
+; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT:    vstr s0, [sp, #4]
+; CHECK-MVE-NEXT:    ldrd r0, r1, [sp]
+; CHECK-MVE-NEXT:    bl fmodf
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT:    vmov r0, s0
+; CHECK-MVE-NEXT:    vmov.16 q6[7], r0
+; CHECK-MVE-NEXT:    vmov q0, q6
+; CHECK-MVE-NEXT:    add sp, #64
+; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-MVE-NEXT:    pop {r4, pc}
+; Both check blocks expect the <8 x half> frem to be expanded lane by lane: each pair of lanes is widened with vcvtb.f32.f16, stored to the stack and reloaded into r0/r1 with ldrd, passed to fmodf, and the result narrowed with vcvtb.f16.f32 before insertion into q6 (eight fmodf calls in total). The MVEFP-prefixed block below expects the same pattern, without the movs/vdup.16 zeroing of q6 seen above.
+; CHECK-MVEFP-LABEL: frem_f16:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    .save {r4, lr}
+; CHECK-MVEFP-NEXT:    push {r4, lr}
+; CHECK-MVEFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-MVEFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-MVEFP-NEXT:    .pad #64
+; CHECK-MVEFP-NEXT:    sub sp, #64
+; CHECK-MVEFP-NEXT:    vmov q5, q1
+; CHECK-MVEFP-NEXT:    vmov q4, q0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #56]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #60]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #56]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r4, s0
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #48]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #52]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #48]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[0], r4
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[1], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[2]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #40]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #44]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #40]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[2], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #32]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #36]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #32]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[3], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #24]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #28]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #24]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[4], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[5]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #16]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #20]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #16]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[5], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[6]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp, #8]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #12]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp, #8]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[6], r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-MVEFP-NEXT:    vmov s2, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s2, s2
+; CHECK-MVEFP-NEXT:    vstr s2, [sp]
+; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-MVEFP-NEXT:    vstr s0, [sp, #4]
+; CHECK-MVEFP-NEXT:    ldrd r0, r1, [sp]
+; CHECK-MVEFP-NEXT:    bl fmodf
+; CHECK-MVEFP-NEXT:    vmov s0, r0
+; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 s0, s0
+; CHECK-MVEFP-NEXT:    vmov r0, s0
+; CHECK-MVEFP-NEXT:    vmov.16 q6[7], r0
+; CHECK-MVEFP-NEXT:    vmov q0, q6
+; CHECK-MVEFP-NEXT:    add sp, #64
+; CHECK-MVEFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-MVEFP-NEXT:    pop {r4, pc}
+entry:
+  %out = frem <8 x half> %in1, %in2
+  ret <8 x half> %out
+}




More information about the llvm-commits mailing list